library(cowplot)
library(ggforce)
library(ggrepel)
library(ggpubr)
library(knitr)
library(ggrastr)
library(DescTools)
library(paletteer)
library(ggalluvial)
library(plotly)
library(ggbeeswarm)
library(h2o)
library(ggokabeito)
library(scico)
library(tidyverse)
theme_set(theme_bw())
###################################################################################################
# args = commandArgs(TRUE)
#
# argmat = sapply(strsplit(args, "="), identity)
#
# for (i in seq.int(length = ncol(argmat))) {
# assign(argmat[1, i], argmat[2, i])
# }
#
# # available variables
# print(ls())
###################################################################################################
############################################################################################### ############################################################################################### ############################################################################################### # ########################### stacked bar-chart WT vs YB and WT vs AubAGO3 ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# read in the data
RAW = read_tsv("quantified-sources.txt") %>%
filter(ANN != "miRNA")
## Rows: 69 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): GENO, TYPE, ANN, CLUSTER, SOURCE, TEtype
## dbl (1): COUNT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#print table to give overview
RAW
## # A tibble: 65 × 7
## GENO TYPE ANN COUNT CLUSTER SOURCE TEtype
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 wsh_GLKD_MAbAgo3-IP_ov sRNA 3UTR 22.2 Y N <NA>
## 2 wsh_GLKD_MAbAgo3-IP_ov sRNA 3UTR 682199 N N <NA>
## 3 wsh_GLKD_MAbAgo3-IP_ov sRNA 5UTR 171963 N N <NA>
## 4 wsh_GLKD_MAbAgo3-IP_ov sRNA CDS 1602970 N N <NA>
## 5 wsh_GLKD_MAbAgo3-IP_ov sRNA CDS 44.5 Y N <NA>
## 6 wsh_GLKD_MAbAgo3-IP_ov sRNA INTRON 477308 N N <NA>
## 7 wsh_GLKD_MAbAgo3-IP_ov sRNA INTRON 7008. Y N <NA>
## 8 wsh_GLKD_MAbAgo3-IP_ov sRNA NONE 1406030 Y N <NA>
## 9 wsh_GLKD_MAbAgo3-IP_ov sRNA NONE 25349900 N N <NA>
## 10 wsh_GLKD_MAbAgo3-IP_ov sRNA TE:!:active 15660700 N N active
## # ℹ 55 more rows
PLOTtable = RAW %>%
filter(GENO %in% c("Wsh_nGFP-Piwi-PiwiIP","AUBshAGO3sh_nGFP-Piwi-PiwiIP")) %>%
mutate(
COUNT = case_when(
ANN == "INTRON" ~ 0,
TRUE ~ COUNT
),
GENO = case_when(
GENO == "Wsh_nGFP-Piwi-PiwiIP" ~ "WT",
GENO == "AUBshAGO3sh_nGFP-Piwi-PiwiIP" ~ "AubAGO3"
),
GENO = factor(GENO, levels = c("WT","AubAGO3"))
)%>%
separate(ANN, c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
group_by(GENO, TYPE,ANN) %>%
summarise(COUNT = sum(COUNT)) %>%
mutate(
COUNT = COUNT / sum(COUNT)
)
PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS" ))
p = PLOTtable %>%
ggplot(aes(x=GENO, y=COUNT, alluvium=ANN, fill=ANN))+
scale_x_discrete(expand = c(.2, .05)) +
geom_bar(stat = "identity", width = 0.2, color = "white", size = 0.2) +
geom_alluvium(width = 0.2, alpha = 0.7,
curve_type = "sigmoid")+ # smoother curves
scale_fill_scico_d(palette = "navia", direction = -1) +
scale_y_continuous(labels = scales::percent_format()) +
theme_cowplot(12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1) # if labels are long
) +
labs(x = NULL, y = "Proportion", fill = "Annotation")
print(p)
print(p)
ggsave(paste("stackedBars_WT_vs_AubAGO3.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)
PLOTtable = RAW %>%
filter(GENO %in% c("wsh_GLKD_MAbAgo3-IP_ov" ,"wsh_GLKD_MAbAub-IP_ov","Wsh_nGFP-Piwi-PiwiIP")) %>%
mutate(
COUNT = case_when(
ANN == "INTRON" ~ 0,
TRUE ~ COUNT
),
# GENO = case_when(
# GENO == "Wsh_nGFP-Piwi-PiwiIP" ~ "WT",
# GENO == "AUBshAGO3sh_nGFP-Piwi-PiwiIP" ~ "AubAGO3"
# ),
GENO = factor(GENO, levels = c("Wsh_nGFP-Piwi-PiwiIP","wsh_GLKD_MAbAub-IP_ov","wsh_GLKD_MAbAgo3-IP_ov" ))
)%>%
separate(ANN, c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
group_by(GENO, TYPE,ANN) %>%
summarise(COUNT = sum(COUNT)) %>%
mutate(
COUNT = COUNT / sum(COUNT)
)
PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS" ))
p = PLOTtable %>%
ggplot(aes(x=GENO, y=COUNT, alluvium=ANN, fill=ANN))+
scale_x_discrete(expand = c(.2, .05)) +
geom_bar(stat = "identity", width = 0.2, color = "white", size = 0.2) +
geom_alluvium(width = 0.2, alpha = 0.7,
curve_type = "sigmoid")+ # smoother curves
scale_fill_scico_d(palette = "navia", direction = -1) +
scale_y_continuous(labels = scales::percent_format()) +
theme_cowplot(12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1) # if labels are long
) +
labs(x = NULL, y = "Proportion", fill = "Annotation")
print(p)
ggsave(paste("PIWI-IPs.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)
############################################################################################### ############################################################################################### ############################################################################################### # ########################### size profile ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# read in the data
RAW = read_tsv("size-profile_perLIB.WT.txt")
## Rows: 90 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): LIB
## dbl (2): LENGTH, COUNT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#barplot with average + error bars
SUMMARY_WT = RAW %>%
group_by(LENGTH) %>%
summarise(
mean_COUNT = mean(COUNT),
sd_COUNT = sd(COUNT)
)
WT = RAW%>%
filter(LENGTH>23)%>%
group_by(LIB)%>%
summarise(
SUM= sum(COUNT)
)%>%
ungroup()%>%
summarise(
mean_COUNT = mean(SUM),
sd_COUNT = sd(SUM)
)
RAW%>%
filter(LENGTH>23)%>%
group_by(LIB)%>%
summarise(totalCOUNT=sum(COUNT))
## # A tibble: 5 × 2
## LIB totalCOUNT
## <chr> <dbl>
## 1 siGFP_OSC_HQK_R1_97373 17006679.
## 2 siGFP_OSC_HQK_R2_98963 18650708.
## 3 siLuc_sRNA_Rep1_243678 18402286.
## 4 siLuc_sRNA_Rep2_243679 17815421.
## 5 siLuc_sRNA_Rep3_243680 13120972.
p2 <- ggplot(SUMMARY_WT, aes(x=LENGTH, y=mean_COUNT)) +
geom_bar(stat="identity", fill="black", color="black") +
geom_point(data=RAW, aes(x=LENGTH, y=COUNT, color=LIB), size=2) +
geom_errorbar(aes(ymin=mean_COUNT - sd_COUNT, ymax=mean_COUNT + sd_COUNT), width=0.2) +
scale_y_continuous(limits=c(0,5000000))+
theme_cowplot() +
labs(title="Average Size profile WT", x="sRNA Length (nt)", y="Average reads") +
theme(legend.position="right", plot.title = element_text(hjust = 0.5))
p2
ggsave(filename="size-profile_perLIB_average_SD.WT.pdf", plot=p2, width=8, height=6, dpi=300)
RAW = read_tsv("size-profile_perLIB.YB.txt")
## Rows: 72 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): LIB
## dbl (2): LENGTH, COUNT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#barplot with average + error bars
SUMMARY_YB = RAW %>%
group_by(LENGTH) %>%
summarise(
mean_COUNT = mean(COUNT),
sd_COUNT = sd(COUNT)
)
YB = RAW%>%
filter(LENGTH>23)%>%
group_by(LIB)%>%
summarise(
SUM= sum(COUNT)
)%>%
ungroup()%>%
summarise(
mean_COUNT = mean(SUM),
sd_COUNT = sd(SUM)
)
p2 <- ggplot(SUMMARY_YB, aes(x=LENGTH, y=mean_COUNT)) +
geom_bar(stat="identity", fill="black", color="black") +
geom_point(data=RAW, aes(x=LENGTH, y=COUNT, color=LIB), size=2) +
geom_errorbar(aes(ymin=mean_COUNT - sd_COUNT, ymax=mean_COUNT + sd_COUNT), width=0.2) +
scale_y_continuous(limits=c(0,5000000))+
theme_cowplot() +
labs(title="Average Size profile WT", x="sRNA Length (nt)", y="Average reads") +
theme(legend.position="right", plot.title = element_text(hjust = 0.5))
p2
ggsave(filename="size-profile_perLIB_average_SD.YB.pdf", plot=p2, width=8, height=6, dpi=300)
############################################################################################### ############################################################################################### ############################################################################################### # ########################### parameter sweep ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# load data
RAW = read_tsv("biogEfficiency.RNAnorm.siYB.shared.final_for-figures_2.corr.txt", col_names = TRUE)%>%
#remove lost YB library
select(-contains("243686"))
## New names:
## Rows: 378 Columns: 273
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (6): TYPE, ID...17, ID...66, ID...259, ID...266, ID...273 dbl (266): TILEsize,
## TILEshift, nucTILEsize, pearson_NUC_A, pearson_NUC_C, p... lgl (1):
## pearson_RNAnorm_nucREGION_CLIP_CLIP_YB.all
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `ID...15` -> `ID...17`
## • `ID...64` -> `ID...66`
## • `ID...257` -> `ID...259`
## • `ID...264` -> `ID...266`
## • `ID...271` -> `ID...273`
#extract available statistical methods
STATs= RAW %>%
select(contains("TILE"), contains("_NUC_"))%>%
pivot_longer(-contains("TILE"))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
select(STATtype)%>%
unique()%>%
unlist()
#fix certain parameters
currSIZE=100
METHOD="kendall"
BINWIDTH=0.025
#functions to round up or down to the nearest multiple
round_up_to <- function(x, multiple) {
ceiling(x / multiple) * multiple
}
round_down_to <- function(x, multiple) {
floor(x / multiple) * multiple
}
for (currTYPE in c("UTR")){
currTABLE = RAW %>%
filter(TYPE == currTYPE)
{}
#determine the limits for the current filter
LIMITS=currTABLE %>%
select(contains("TILE"), contains("_NUC_"))%>%
pivot_longer(-contains("TILE"))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
filter(STATtype == METHOD & TILEsize==currSIZE )%>%
summarise(minVAL=min(value), maxVAL=max(value))
#round to the nearest binwidth
maxVAL=round_up_to(LIMITS$maxVAL,BINWIDTH)
minVAL=round_down_to(LIMITS$minVAL,BINWIDTH)
BREAKS=seq(minVAL,maxVAL,by=BINWIDTH)
# minVAL=-0.225
# maxVAL=0.45
#loop through all the nucleotides to get individual plots
for (currNUC in c("A","C","G","T")){
#extract the relevant data for the current plot and change T to U
X=currTABLE %>%
# filter(TYPE == currTYPE )%>%
select(contains("TILE"), contains("_NUC_"),TYPE)%>%
pivot_longer(-c(contains("TILE"),TYPE))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
mutate(name = case_when(
str_detect(name, "NUC_T") ~ "NUC_U" ,
TRUE ~ paste(NUC,NUCid,sep="_")
))%>%
filter(TILEsize==currSIZE & NUCid==currNUC & STATtype == METHOD)
MIN=currTABLE%>%
select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
filter(str_detect(name,METHOD))%>%
select(value)%>%min()*1.1
MAX=currTABLE%>%
select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
filter(str_detect(name,METHOD))%>%
select(value)%>%max()*1.1
#determine the maximum correlation
maxLINE= X %>%
group_by(TILEsize)%>%
filter(value==max(value))%>%
#round number
mutate(value=round(value,2))
#select the current nucleotide for the plot
X = X %>%
mutate(
value = case_when(
# nucTILEsize==min(nucTILEsize) & TILEshift==min(TILEshift) ~ minVAL,
# nucTILEsize==max(nucTILEsize) & TILEshift==max(TILEshift) ~ maxVAL,
# value < minVAL ~ minVAL,
# value > maxVAL ~ maxVAL,
TRUE ~ value
)
)
#plot the graph
p=X %>%
ggplot()+
# geom_contour_filled(aes(y=nucTILEsize, x=TILEshift, z=value) )+
geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
geom_point_rast(data=maxLINE, aes(y=nucTILEsize, x=TILEshift),color="red")+
geom_point_rast(aes(y=750, x=-600),color="red")+
geom_text(data = maxLINE,
aes(y = nucTILEsize, x = TILEshift, label = value),
color = "black",
size = 2,
vjust = 1.5) + # Adjust position vertically
# facet_wrap(~TYPE) +
geom_vline(xintercept = 00, linetype = 3)+
# geom_vline(xintercept = -420, linetype = 2)+
geom_hline(yintercept = 100, linetype = 3)+
labs(title=paste(currTYPE, METHOD, currNUC, sep="_"),
y="window used to evaluate CLIP signal [nt]",
x="start position shift relative to the piRNA tile [nt]"
)+
theme_cowplot(12)+
# guides(fill = guide_legend(ncol = 1))+
theme(
# legend.position = "none",
axis.text = element_text(size=12),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title = element_text(size=10),
strip.text = element_text(size=10),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
aspect.ratio = 1
)+
scale_fill_scico(limits = c(minVAL, maxVAL),midpoint=0, palette = "vik", direction = 1 )
# scale_fill_scico( palette = "vik", direction = 1 )
print(p)
if(currNUC=="T"){
HEIGHT=100
WIDTH=100
}else{
HEIGHT=100
WIDTH=100
dpi=100
}
ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".",currNUC,".",currTYPE,".pdf",sep=""), p, height = HEIGHT, width = WIDTH, units = "mm", dpi = 300)
}
}
#extract available statistical methods
STATs= RAW %>%
select(contains("TILE"), contains("_diNUC_"))%>%
pivot_longer(-contains("TILE"))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
select(STATtype)%>%
unique()%>%
unlist()
#fix certain parameters
currSIZE=100
METHOD="kendall"
BINWIDTH=0.025
#functions to round up or down to the nearest multiple
round_up_to <- function(x, multiple) {
ceiling(x / multiple) * multiple
}
round_down_to <- function(x, multiple) {
floor(x / multiple) * multiple
}
for (currTYPE in c("UTR")){
currTABLE = RAW %>%
filter(TYPE == currTYPE)
{}
#determine the limits for the current filter
LIMITS=currTABLE %>%
select(contains("TILE"), contains("_diNUC_"))%>%
pivot_longer(-contains("TILE"))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
filter(STATtype == METHOD & TILEsize==currSIZE )%>%
summarise(minVAL=min(value), maxVAL=max(value))
#round to the nearest binwidth
maxVAL=round_up_to(LIMITS$maxVAL,BINWIDTH)
minVAL=round_down_to(LIMITS$minVAL,BINWIDTH)
BREAKS=seq(minVAL,maxVAL,by=BINWIDTH)
p = currTABLE %>%
select(contains("TILE"), contains("_diNUC_"),TYPE)%>%
pivot_longer(-c(contains("TILE"),TYPE))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
filter(TILEsize==currSIZE & STATtype == METHOD)%>%
ggplot()+
geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
facet_wrap(~NUCid) +
geom_vline(xintercept = 00, linetype = 3)+
# geom_vline(xintercept = -420, linetype = 2)+
geom_hline(yintercept = 100, linetype = 3)+
labs(title=paste(currTYPE, METHOD, currNUC, sep="_"),
y="window used to evaluate CLIP signal [nt]",
x="start position shift relative to the piRNA tile [nt]"
)+
theme_cowplot(12)+
# guides(fill = guide_legend(ncol = 1))+
theme(
# legend.position = "none",
axis.text = element_text(size=12),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title = element_text(size=10),
strip.text = element_text(size=10),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
aspect.ratio = 1
)+
# scale_fill_scico(limits = c(MIN, MAX),midpoint=0, palette = "vik", direction = 1 )
scale_fill_scico( palette = "vik", direction = 1 )
ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".diNUC.",currTYPE,".pdf",sep=""), p, height = 600, width = 600, units = "mm", dpi = 300)
#loop through all the nucleotides to get individual plots
for (currNUC in c("TT")){
#extract the relevant data for the current plot and change T to U
X=currTABLE %>%
# filter(TYPE == currTYPE )%>%
select(contains("TILE"), contains("_diNUC_"),TYPE)%>%
pivot_longer(-c(contains("TILE"),TYPE))%>%
separate(name, c("STATtype","NUC","NUCid"), sep = "_", remove = FALSE)%>%
mutate(name = case_when(
str_detect(name, "NUC_TT") ~ "NUC_UU" ,
TRUE ~ paste(NUC,NUCid,sep="_")
))%>%
filter(TILEsize==currSIZE & NUCid==currNUC & STATtype == METHOD)
MIN=currTABLE%>%
select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
filter(str_detect(name,METHOD))%>%
select(value)%>%min()*1.1
MAX=currTABLE%>%
select(starts_with("TILE"),starts_with("nucTILE"), contains("RNAnorm_nucREGION_CLIP_CLIP_Y"),contains("NUC_"))%>%
pivot_longer(-c(starts_with("TILE"),starts_with("nucTILE")))%>%
filter(str_detect(name,METHOD))%>%
select(value)%>%max()*1.1
#determine the maximum correlation
maxLINE= X %>%
group_by(TILEsize)%>%
filter(value==max(value))%>%
#round number
mutate(value=round(value,2))
#select the current nucleotide for the plot
X = X %>%
mutate(
case_when(
nucTILEsize==min(nucTILEsize) & TILEshift==min(TILEshift) ~ minVAL,
nucTILEsize==max(nucTILEsize) & TILEshift==max(TILEshift) ~ maxVAL,
TRUE ~ value
)
)
#plot the graph
p=X %>%
ggplot()+
# geom_contour_filled(aes(y=nucTILEsize, x=TILEshift, z=value) )+
geom_tile(aes(y=nucTILEsize, x=TILEshift, fill=value) )+
geom_point_rast(data=maxLINE, aes(y=nucTILEsize, x=TILEshift),color="red")+
geom_point_rast(aes(y=750, x=-600),color="red")+
geom_text(data = maxLINE,
aes(y = nucTILEsize, x = TILEshift, label = value),
color = "black",
size = 2,
vjust = 1.5) + # Adjust position vertically
# facet_wrap(~TYPE) +
geom_vline(xintercept = 00, linetype = 3)+
# geom_vline(xintercept = -420, linetype = 2)+
geom_hline(yintercept = 100, linetype = 3)+
labs(title=paste(currTYPE, METHOD, currNUC, sep="_"),
y="window used to evaluate CLIP signal [nt]",
x="start position shift relative to the piRNA tile [nt]"
)+
theme_cowplot(12)+
# guides(fill = guide_legend(ncol = 1))+
theme(
# legend.position = "none",
axis.text = element_text(size=12),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.title = element_text(size=10),
strip.text = element_text(size=10),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
aspect.ratio = 1
)+
# scale_fill_scico(limits = c(MIN, MAX),midpoint=0, palette = "vik", direction = 1 )
scale_fill_scico( palette = "vik", direction = 1 )
print(p)
if(currNUC=="T"){
HEIGHT=100
WIDTH=100
}else{
HEIGHT=100
WIDTH=100
dpi=100
}
ggsave(paste("parameter-sweep.correct_nuc.biogEFF.",METHOD,".",currNUC,".",currTYPE,".pdf",sep=""), p, height = HEIGHT, width = WIDTH, units = "mm", dpi = 300)
}
}
############################################################################################### ############################################################################################### ############################################################################################### # ########################### biogenesis-efficiency comparison ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# load data
RAW = read_tsv("full-length.quantified.ALL.txt", col_names = TRUE)%>%
#remove lost YB library
select(-contains("243686"))%>%
{}
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 11954 Columns: 602
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): CHR, FBgn, TYPE, STRAND, BLOCKSTARTS, fullLengthUNIQ
## dbl (594): START, STOP, X, thickSTART, thickSTOP, nBLOCKS, RNAseq_RPKM, TTse...
## num (2): COLOR, BLOCKSIZES
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# filter data
TABLEfilt= RAW %>%
filter(TYPE %in% c("UTR","CDS", "CLUSTER")) %>%
filter(!str_detect(FBgn, "GL")) %>%
select(-contains("ARMI"))%>%
mutate(LENGTHds500=LENGTH-500)%>%
mutate(NUC_GC = fullLENGTH_NUC_C + fullLENGTH_NUC_G)%>%
#filters implemented for the study
filter(UNIQ > 50, RNAseq_RPKM>5 ) %>%
{}
nLINES=nrow(TABLEfilt)
#calculate RNAseq/TTseq corrected smallRNA and CLIP counts
TABLEfilt <- TABLEfilt %>%
mutate(
across(
starts_with(c("sRNAdata_")),
~.x / TTseq_RPKM,
.names = "TTnorm_{.col}"
),
across(
starts_with(c("sRNAdata_")),
~.x / RNAseq_RPKM,
.names = "RNAnorm_{.col}"
),
)
PLOTtable = TABLEfilt %>%
select(FBgn, TYPE, `TTnorm_sRNAdata_average-WT`, `TTnorm_sRNAdata_average-YB`)%>%
rename( "TT_WT" = `TTnorm_sRNAdata_average-WT`, "TT_YB"=`TTnorm_sRNAdata_average-YB`)%>%
separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
mutate(
CLUSTER = case_when(
TYPE == "CLUSTER" ~ CLUSTER,
TRUE ~ "other-source-loci"
),
dotSIZE = case_when(
TYPE == "CLUSTER" ~ 1.5,
TRUE ~ 0.01
)
)
# First, get the unique clusters excluding "other-source-loci"
n_clusters <- length(unique(PLOTtable$CLUSTER[PLOTtable$CLUSTER != "other-source-loci"]))
# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
cluster_colors <- c(okabe_ito[1:n_clusters], "black")
PLOTtable$TYPE <- factor(PLOTtable$TYPE,
levels = c("CDS", "UTR", "CLUSTER"))
p= PLOTtable %>%
pivot_longer(cols = c(TT_WT, TT_YB), names_to = "NORMtype", values_to = "COUNT")%>%
ggplot(aes(x=TYPE,y=COUNT,colour = TYPE))+
# geom_point()+
geom_quasirandom_rast( aes(size=dotSIZE), alpha=1, varwidth = TRUE, groupOnX = TRUE)+
scale_y_log10(limits=c(0.001,300))+
# scale_color_igv()+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) + # Add this line
scale_size_identity()+
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)+ coord_cartesian(clip = "off")+
annotation_logticks(sides = "l", outside=FALSE) +
facet_row(~NORMtype)+
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
legend.position = "none"
)+
{}
print(p)
ggsave("biogenesis-efficiency.TTseq.pdf", p, width = 3, height = 5, units = "in", dpi = 300)
PLOTtable = TABLEfilt %>%
select(FBgn, TYPE, `RNAnorm_sRNAdata_average-WT`,`RNAnorm_sRNAdata_average-YB`)%>%
rename( "RNA_WT" = `RNAnorm_sRNAdata_average-WT`, "RNA_YB"=`RNAnorm_sRNAdata_average-YB`)%>%
separate(FBgn, c("CLUSTER","PosInCluster"), sep = "-", remove = FALSE)%>%
mutate(
CLUSTER = case_when(
TYPE == "CLUSTER" ~ CLUSTER,
TRUE ~ "other-source-loci"
),
dotSIZE = case_when(
TYPE == "CLUSTER" ~ 1.5,
TRUE ~ 0.01
)
)
# First, get the unique clusters excluding "other-source-loci"
n_clusters <- length(unique(PLOTtable$CLUSTER[PLOTtable$CLUSTER != "other-source-loci"]))
# Create the color palette
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
cluster_colors <- c(okabe_ito[1:n_clusters], "black")
PLOTtable$TYPE <- factor(PLOTtable$TYPE,
levels = c("CDS", "UTR", "CLUSTER"))
p= PLOTtable %>%
pivot_longer(cols = c(RNA_WT, RNA_YB), names_to = "NORMtype", values_to = "COUNT")%>%
ggplot(aes(x=TYPE,y=COUNT,colour = TYPE))+
geom_quasirandom( aes(size=dotSIZE), alpha=1, varwidth = TRUE, groupOnX = TRUE)+
scale_y_log10(limits=c(0.001,300))+
# scale_color_igv()+
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#e6a025" )) + # Add this line
scale_size_identity()+
facet_row(~NORMtype)+
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank()
)+
{}
print(p)
ggsave("biogenesis-efficiency.RNAseq.pdf", p, width = 3.8, height = 5, units = "in", dpi = 300)
############################################################################################### ############################################################################################### ############################################################################################### # ########################### plot data in YB-depleted condition ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # model on siYB tiles
# load data
RAW = read_tsv("YBdep_tiles.100_0_100.txt", col_names = TRUE)%>%
# RAW = read_tsv("tiles.100_-600_750_quantified.allLIBs.txt", col_names = TRUE)%>%
#remove lost YB library
select(-contains("243686"))%>%
mutate(
TYPE = case_when(
grepl("CDS", CHR) ~ "CDS",
grepl("UTR", CHR) ~ "UTR",
grepl("CLUSTER", CHR) ~ "CLUSTER",
TRUE ~ "OTHER"
),
)
## Rows: 84980 Columns: 111
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): CHR, FBgn, STRAND
## dbl (108): START, STOP, X, thickSTART, thickSTOP, nPOS, RNAseq_RPKM, TTseq_R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
{}
## NULL
# filter data
TABLEfilt= RAW %>%
filter(TYPE == "UTR" )%>%
dplyr::select(-contains("ARMI"))%>%
#filters implemented for the study
filter(UNIQ > 50, RNAseq_RPKM>5,TTseq_RPKM>5, nPOS>0, UTR_LENGTH>700 ) %>%
{}
nLINES=nrow(TABLEfilt)
#calculate RNAseq/TTseq corrected smallRNA and CLIP counts
TABLEfilt <- TABLEfilt %>%
mutate(
across(
starts_with(c("sRNAdata_")),
~.x / TTseq_RPKM ,
.names = "TTnorm_{.col}"
),
across(
starts_with(c("sRNAdata_")),
~.x / RNAseq_RPKM,
.names = "RNAnorm_{.col}"
),
across(
starts_with(c("CLIP_")),
~.x / RNAseq_RPKM,
.names = "RNAnorm_{.col}"
)
)
plotTABLEorig = TABLEfilt %>%
select(-contains("sRNAdata_average-ARMI"))%>%
rename(TTnorm_sRNAdata_average_YB = `TTnorm_sRNAdata_average-YB`)%>%
rename(RNAnorm_sRNAdata_average_YB = `RNAnorm_sRNAdata_average-YB`)%>%
rename(TTnorm_sRNAdata_average_WT = `TTnorm_sRNAdata_average-WT`)%>%
rename(RNAnorm_sRNAdata_average_WT = `RNAnorm_sRNAdata_average-WT`)%>%
filter(!is.na(TTnorm_sRNAdata_average_YB),TTnorm_sRNAdata_average_YB> 0.01)%>%
mutate(
BIN2 = as.factor(cut_interval(log10(TTnorm_sRNAdata_average_YB), n=4, labels = FALSE))
)%>%
#rename column `TTnorm_sRNAdata_average-WT` to TTnorm_sRNAdata_average_WT
{}
# Get bin breakpoints
bin_breaks <- plotTABLEorig %>%
group_by(BIN2) %>%
summarise(min_YBdep = min(TTnorm_sRNAdata_average_YB, na.rm = TRUE)) %>%
pull(min_YBdep)
# Remove first bin (we don’t need a line at the lowest boundary)
bin_breaks <- sort(bin_breaks[-1])
#determine number per bin
count_data <- plotTABLEorig %>%
group_by(BIN2) %>%
summarise(n = n())
# Set seed for reproducible jittering
set.seed(123)
p = plotTABLEorig %>%
filter(nPOS>0)%>%
mutate(
HIGHLIGHT = case_when(
FBgn == "FBgn0086359" ~ "OK",
FBgn == "FBgn0262656" ~ "OK",
FBgn == "FBgn0260748" ~ "OK",
TRUE ~ "NO"
),
label = case_when(
HIGHLIGHT == "OK" ~ FBgn,
TRUE ~ NA_character_
)
) %>%
mutate(HIGHLIGHT = factor(HIGHLIGHT, levels = c("NO", "OK"))) %>%
select(FBgn, TTnorm_sRNAdata_average_YB, TTnorm_sRNAdata_average_WT,RNAnorm_sRNAdata_average_WT,RNAnorm_sRNAdata_average_YB, HIGHLIGHT, label) %>%
pivot_longer(-c(FBgn, HIGHLIGHT, label), names_to = "name", values_to = "value") %>%
mutate(
normTYPE = case_when(
grepl("TTnorm", name) ~ "TTnorm",
grepl("RNAnorm", name) ~ "RNAnorm",
TRUE ~ "Unknown"
),
name = case_when(
str_detect(name, "WT") ~ "WT",
str_detect(name, "YB") ~ "YB"
)
)%>%
ggplot(aes(x = name, y = value)) +
geom_quasirandom_rast(aes(color = HIGHLIGHT),
size = 0.2,
width = 0.4,
groupOnX = FALSE) +
geom_hline(yintercept = bin_breaks, linetype = "dashed", color = "red", size = 1) +
# annotate("text", x = 1.4, y = bin_breaks,
# label = round(bin_breaks, 3), hjust = 1, vjust = 2, color = "red") +
scale_color_manual(values = c("NO" = "black", "OK" = "red")) +
scale_y_log10(breaks = scales::log_breaks()) +
annotation_logticks(sides = "l", outside = TRUE) +
facet_row(~normTYPE) +
coord_cartesian(clip = "off") +
# Extend the plot area to make room for labels
# coord_cartesian(clip = "off", xlim = c(0.5, 1.5)) +
labs(x = "for-size", y = "Biogenesis Efficiench") +
theme_cowplot(14) +
theme(
axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.position = "none",
# Add more right margin for labels if needed
plot.margin = margin(5.5, 40, 5.5, 5.5, "pt")
)
p
## Warning in scale_y_log10(breaks = scales::log_breaks()): log-10 transformation
## introduced infinite values.
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave2("siYB_100nt-tiles_BiogEF.violin.pdf", p, width = 5, height = 5)
## Warning in scale_y_log10(breaks = scales::log_breaks()): log-10 transformation introduced infinite values.
## Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
plotTABLEnuc2 = plotTABLEorig %>%
select(FBgn, BIN2, starts_with("NUC_")) %>%
pivot_longer(-c(FBgn, BIN2), names_to = "name", values_to = "value")%>%
separate(name, c("TYPE", "name"), sep = "_")%>%
group_by(name, BIN2)
count_data <- plotTABLEnuc2 %>%
summarise(n = n())
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
# Compute maximum y-value per BIN2 + small offset
max_values <- plotTABLEnuc2 %>%
summarise(y_max = max(value, na.rm = TRUE) + 2) # Increase by 10%
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
# Define group comparisons
comparisons <- list(c("1", "2"), c("2", "3"), c("3", "4"))
# Merge y_max values for p-value label positions
y_positions <- max_values$y_max[-c(1, 5,9,13)] # Adjust y-positions for p-values
p2 = plotTABLEnuc2 %>%
ggplot(aes(x = BIN2, y = value)) +
# geom_quasirandom_rast(size = 0.1, alpha = 1, linewidth = 0.3, draw_quantiles = c(0.5)) +
geom_violin_rast(size = 0.1, alpha = 1, linewidth = 0.3, fill="black") +
# geom_boxplot(fill="black") +
facet_wrap(~name, nrow=1,) +
# Add count labels
geom_text(data = count_data, aes(x = BIN2, y = 8, label = n),
color = "black", size = 3, vjust = 0) +
coord_cartesian(ylim=c(5,60)) +
# Add median crossbars
stat_summary(
fun = median,
geom = "crossbar",
width = 0.8,
fatten = 3,
color = "red"
) +
# Add p-values dynamically above max y-values
stat_compare_means(comparisons = comparisons,
method = "wilcox.test",
# label = "p.signif",label.y = y_positions) + # Use computed y-positions
label = "p.signif",label.y = 45) + # Use computed y-positions
labs(y = "[%] Nucleotide", x= "YB-dependency bins") +
theme_cowplot(14) +
theme(
axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.position = "none",
panel.spacing = unit(1, "lines"),
# Black border on facet strips
strip.background = element_rect(fill = "white", color = "black", size = 1),
strip.text = element_text(face = "bold", color = "black")
)
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2
ggsave2("siYB_100nt-tiles_BiogEF.violin-nuc.pdf", p2, width = 6, height = 4)
plotTABLEorig = TABLEfilt %>%
select(-contains("sRNAdata_average-ARMI"))%>%
rename(TTnorm_sRNAdata_average_YB = `TTnorm_sRNAdata_average-YB`)%>%
rename(RNAnorm_sRNAdata_average_YB = `RNAnorm_sRNAdata_average-YB`)%>%
rename(TTnorm_sRNAdata_average_WT = `TTnorm_sRNAdata_average-WT`)%>%
rename(RNAnorm_sRNAdata_average_WT = `RNAnorm_sRNAdata_average-WT`)%>%
filter(!is.na(TTnorm_sRNAdata_average_YB),TTnorm_sRNAdata_average_YB> 0.01)%>%
mutate(
BIN2 = as.factor(cut_interval(log10(TTnorm_sRNAdata_average_YB), n=4, labels = FALSE))
)%>%
#rename column `TTnorm_sRNAdata_average-WT` to TTnorm_sRNAdata_average_WT
filter(nPOS> 6, ) %>%
{}
# Get bin breakpoints
bin_breaks <- plotTABLEorig %>%
group_by(BIN2) %>%
summarise(min_YBdep = min(TTnorm_sRNAdata_average_YB, na.rm = TRUE)) %>%
pull(min_YBdep)
# Remove first bin (we don’t need a line at the lowest boundary)
bin_breaks <- sort(bin_breaks[-1])
#determine number per bin
count_data <- plotTABLEorig %>%
group_by(BIN2) %>%
summarise(n = n())
# Set seed for reproducible jittering
set.seed(123)
p = plotTABLEorig %>%
filter(nPOS>0)%>%
mutate(
HIGHLIGHT = case_when(
FBgn == "FBgn0086359" ~ "OK",
FBgn == "FBgn0262656" ~ "OK",
FBgn == "FBgn0260748" ~ "OK",
TRUE ~ "NO"
),
label = case_when(
HIGHLIGHT == "OK" ~ FBgn,
TRUE ~ NA_character_
)
) %>%
mutate(HIGHLIGHT = factor(HIGHLIGHT, levels = c("NO", "OK"))) %>%
select(FBgn, TTnorm_sRNAdata_average_YB, TTnorm_sRNAdata_average_WT,RNAnorm_sRNAdata_average_WT,RNAnorm_sRNAdata_average_YB, HIGHLIGHT, label) %>%
pivot_longer(-c(FBgn, HIGHLIGHT, label), names_to = "name", values_to = "value") %>%
mutate(
normTYPE = case_when(
grepl("TTnorm", name) ~ "TTnorm",
grepl("RNAnorm", name) ~ "RNAnorm",
TRUE ~ "Unknown"
),
name = case_when(
str_detect(name, "WT") ~ "WT",
str_detect(name, "YB") ~ "YB"
)
)%>%
ggplot(aes(x = name, y = value)) +
geom_quasirandom_rast(aes(color = HIGHLIGHT),
size = 0.2,
width = 0.4,
groupOnX = FALSE) +
geom_hline(yintercept = bin_breaks, linetype = "dashed", color = "red", size = 1) +
# annotate("text", x = 1.4, y = bin_breaks,
# label = round(bin_breaks, 3), hjust = 1, vjust = 2, color = "red") +
scale_color_manual(values = c("NO" = "black", "OK" = "red")) +
scale_y_log10(breaks = scales::log_breaks()) +
annotation_logticks(sides = "l", outside = TRUE) +
facet_row(~normTYPE) +
coord_cartesian(clip = "off") +
# Extend the plot area to make room for labels
# coord_cartesian(clip = "off", xlim = c(0.5, 1.5)) +
labs(x = "for-size", y = "Biogenesis Efficiench") +
theme_cowplot(14) +
theme(
axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.position = "none",
# Add more right margin for labels if needed
plot.margin = margin(5.5, 40, 5.5, 5.5, "pt")
)
p
ggsave2("siYB_100nt-tiles_BiogEF.violin.ds600.pdf", p, width = 5, height = 5)
p=plotTABLEorig %>%
# filter(TTnorm_sRNAdata_average_YB <0.5) %>%
ggplot(aes(x=TTnorm_sRNAdata_average_YB,y=TTseq_RPKM, text=CHR))+
geom_point()+
scale_y_log10()+
scale_x_log10()
ggplotly(p, tooltip = c("text", "TTnorm_sRNAdata_average_YB", "TTseq_RPKM"))
plotTABLE = TABLEfilt %>%
select(FBgn, nPOS, TTseq_RPKM, RNAseq_RPKM, contains("sRNAdata_average-YB"), contains("sRNAdata_average-WT")) %>%
separate(FBgn, c("ID"), sep = ":!:", remove = FALSE, convert = TRUE) %>%
filter(nPOS<=40)%>%
group_by(ID) %>%
mutate(
scaledLEVEL = `sRNAdata_average-YB` / max(`sRNAdata_average-YB`, na.rm = TRUE )
)
## Warning: Expected 1 pieces. Additional pieces discarded in 14419 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Create a summary table with counts
count_table <- plotTABLE %>%
group_by(nPOS) %>%
summarise(n = n(), .groups = 'drop')
p = plotTABLE %>%
ggplot(aes(x=as.factor(nPOS), y=`TTnorm_sRNAdata_average-YB`))+
geom_boxplot(aes(x=as.factor(nPOS)), outlier.size = 0.1 , linewidth=0.2)+
geom_smooth(aes(x=nPOS) )+
scale_y_log10()+
# scale_y_continuous(limits = c(-0.1, 1)) +
geom_text(data = count_table,
aes(x = as.factor(nPOS), y = 0.01, label = paste("n =", n)),
size = 3, angle = 90, hjust = 1) +
theme_cowplot(14) +
theme(
legend.position = "none"
)
print(p)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ggsave("siYB_100nt-tiles_ramp-up.biogEFF.pdf", p, width = 6, height = 4, dpi = 300)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
plotTABLE = TABLEfilt %>%
select(FBgn, nPOS, starts_with("NUC_")) %>%
separate(FBgn, c("ID"), sep = ":!:", remove = FALSE, convert = TRUE) %>%
filter(nPOS<=40)%>%
group_by(ID) %>%
{}
## Warning: Expected 1 pieces. Additional pieces discarded in 14419 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Create a summary table with counts
count_table <- plotTABLE %>%
group_by(nPOS) %>%
summarise(n = n(), .groups = 'drop')
n_clusters=3
okabe_ito <- as.character(paletteer_d("colorblindr::OkabeIto"))
cluster_colors <- c(okabe_ito[1:n_clusters], "black")
p = plotTABLE %>%
ungroup()%>%
pivot_longer(- c(FBgn,ID, nPOS), names_to = "name", values_to = "value") %>%
ggplot(aes(x=as.factor(nPOS), y=value, color=name))+
geom_boxplot(aes(x=as.factor(nPOS)), outlier.size = 0.1 , linewidth=0.2)+
geom_smooth(aes(x=nPOS))+
scale_y_log10()+
# scale_y_continuous(limits = c(-0.1, 1)) +
# geom_text(data = count_table,
# aes(x = as.factor(nPOS), y = 0.01, label = paste("n =", n)),
# size = 3, angle = 90, hjust = 1) +
theme_cowplot(14) +
theme(
legend.position = "bottom"
)+
scale_color_manual(values = cluster_colors)
print(p)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
############################################################################################### ############################################################################################### ############################################################################################### # ########################### model on siYB tiles - full UTR ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # model on siYB tiles
# load data
RAW = read_tsv("YBdep_tiles.100_0_100.txt", col_names = TRUE)%>%
# RAW = read_tsv("tiles.100_-600_750_quantified.allLIBs.txt", col_names = TRUE)%>%
#remove lost YB library
select(-contains("243686"))%>%
mutate(
TYPE = case_when(
grepl("CDS", CHR) ~ "CDS",
grepl("UTR", CHR) ~ "UTR",
grepl("CLUSTER", CHR) ~ "CLUSTER",
TRUE ~ "OTHER"
),
)
## Rows: 84980 Columns: 111
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): CHR, FBgn, STRAND
## dbl (108): START, STOP, X, thickSTART, thickSTOP, nPOS, RNAseq_RPKM, TTseq_R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
{}
## NULL
NUCclass="NUC_"
# filter data
TABLEfilt= RAW %>%
filter(TYPE == "UTR" )%>%
filter(UNIQ > 50, RNAseq_RPKM>1,TTseq_RPKM>1, nPOS>0, UTR_LENGTH>800 ) %>%
select(-`sRNAdata_average-YB_CLUSTERuniq`)%>%
{}
#calculate YB-dependency
TABLEfilt <- TABLEfilt %>%
mutate(
sRNAdata_average_YB = log10(`sRNAdata_average-YB`),
)%>%
filter(!is.na(sRNAdata_average_YB),!is.na(sRNAdata_average_YB),!is.infinite(sRNAdata_average_YB))%>%
{}
# Calculate percentiles
percentiles <- TABLEfilt %>%
select(FBgn,TYPE,contains("sRNAdata_average-YB")) %>%
pivot_longer(-c(FBgn,TYPE), names_to = "name", values_to = "value", names_ptypes = list(name = factor())) %>%
mutate(name = fct_inorder(name)) %>%
group_by(TYPE,name) %>%
summarise(q10 = quantile(value, 0.05),q25 = quantile(value, 0.25),q75 = quantile(value, 0.75), q90 = quantile(value, 0.95))
## `summarise()` has grouped output by 'TYPE'. You can override using the
## `.groups` argument.
# Join percentiles to original data
data <- TABLEfilt %>%
select(FBgn,TYPE,contains("sRNAdata_average-YB")) %>%
pivot_longer(-c(FBgn,TYPE), names_to = "name", values_to = "value", names_ptypes = list(name = factor())) %>%
mutate(name = fct_inorder(name)) %>%
left_join(percentiles, by = c("name", "TYPE"))
nLINES=nrow(data)
# Plot YB-depleted spread - length normalized
COLORS=paletteer_d("ggthemes::Winter")
p=TABLEfilt %>%
select(FBgn, TYPE, contains("sRNAdata_average-YB"),-contains("RAW"))%>%
pivot_longer(-c(FBgn,TYPE))%>%
ggplot(aes(x=name,y=value))+
geom_violin(size=0.1,alpha=0.1, fill="grey")+
annotate("text", x = Inf, y = Inf,
label = paste("n=",nLINES,sep=""),hjust = 1, vjust = 1)+
scale_y_log10()+
ylab("sRNA reads per 1000 nt")+
geom_hline(yintercept = 12.2, linetype="dashed")+
geom_hline(yintercept = 346, linetype="dashed")+
theme_bw()+
theme(
aspect.ratio = 1,
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.position = "none",
axis.text = element_text(size=12),
axis.title = element_text(size=14)
)
print(p)
# ggsave(p, filename = "YB-depleted-spread.pdf", width = 6, height = 8)
# Load the library
# Start the H2O cluster
# h2o.no_progress()
h2o.init(max_mem_size = "20g", nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 13 hours
## H2O cluster timezone: Europe/Vienna
## H2O data parsing timezone: UTC
## H2O cluster version: 3.46.0.7
## H2O cluster version age: 10 months and 12 days
## H2O cluster name: H2O_started_from_R_dominik.handler_fuo893
## H2O cluster total nodes: 1
## H2O cluster total memory: 17.66 GB
## H2O cluster total cores: 11
## H2O cluster allowed cores: 11
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (10 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
h2o.removeAll()
#random splitting
VARI="sRNAdata_average_YB"
quantiles <- quantile(TABLEfilt$RNAseq_RPKM, probs = c(0.75, 0.90, 0.95))
TABLEfilt$WEIGHT <- ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[3], 10, # Top 5%: weight = 10
ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[2], 5, # Top 10%: weight = 5
ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[1], 2, # Top 25%: weight = 2
1))) # Others: weight = 1
scaled_RNA <- (TABLEfilt$`sRNAdata_average-YB` - min(TABLEfilt$`sRNAdata_average-YB`)) /
(max(TABLEfilt$`sRNAdata_average-YB`) - min(TABLEfilt$`sRNAdata_average-YB`))*1.4
TABLEfilt$WEIGHT = 100^scaled_RNA
max_smallRNA <- max(TABLEfilt$`sRNAdata_average-YB`)
TABLEfilt$WEIGHT <- 100^(1/((max_smallRNA + 1) / (TABLEfilt$`sRNAdata_average-YB` + 1)))
p=TABLEfilt %>%
ggplot(aes(x=`sRNAdata_average-YB`, y=WEIGHT, text=FBgn)) +
geom_point()
print(p)
#splitting keeping genes together
TABLEmodel = TABLEfilt%>%
filter(TYPE=="UTR",nPOS>0)%>%
select(FBgn, VARI,starts_with(NUCclass), TTseq_RPKM,CHR,RNAseq_RPKM,WEIGHT, UTR_LENGTH, nPOS)%>%
# select(FBgn, VARI, starts_with(NUCclass), TTseq_RPKM,CHR,RNAseq_RPKM,WEIGHT )%>%
{}
set.seed(1238575) # reproducible
unique_chr <- unique(TABLEmodel$CHR)
train_chr <- sample(unique_chr,
size = floor(0.7 * length(unique_chr))) # ≈ 75 % of chromosomes
residual_chr <- setdiff(unique_chr, train_chr) # Remaining chromosomes
val_chr <- sample(residual_chr,
size = floor(0.5 * length(residual_chr))) # ≈ 75 % of chromosomes
test_chr <- setdiff(residual_chr, val_chr)
# --- 3. Split the rows on those chromosome sets -------------------------
train_df <- TABLEmodel %>% filter(CHR %in% train_chr)
val_df <- TABLEmodel %>% filter(CHR %in% val_chr)
test_df <- TABLEmodel %>% filter(CHR %in% test_chr)
# --- 4. Drop CHR (if you don’t want it as a feature) and send to H2O ----
train <- as.h2o(train_df %>% select(-CHR), destination_frame = "train.hex")
## | | | 0% | |======================================================================| 100%
val <- as.h2o(val_df %>% select(-CHR), destination_frame = "val.hex")
## | | | 0% | |======================================================================| 100%
test <- as.h2o(test_df %>% select(-CHR), destination_frame = "test.hex")
## | | | 0% | |======================================================================| 100%
# --- 5. (Optional) inspect sizes ----------------------------------------
n_train <- nrow(train)
n_val <- nrow(val)
n_test <- nrow(test)
percent_train = round(n_train *100 / sum(n_train, n_val, n_test),2)
percent_val = round(n_val *100 / sum(n_train, n_val, n_test),2)
percent_test = round(n_test *100 / sum(n_train, n_val, n_test),2)
h2o.dim(train)
## [1] 9468 11
h2o.dim(val)
## [1] 1895 11
h2o.dim(test)
## [1] 1989 11
a=as.data.frame(train) %>%
ggplot(aes(x=!!sym(VARI)))+
geom_histogram()+
# scale_x_log10(limits=c(1,1000))+
annotate("text", x = 0, y = 200, label =paste("n=",n_train,"\n[%]=",percent_train,sep=""))
b=as.data.frame(test)%>%
ggplot(aes(x=!!sym(VARI)))+
geom_histogram()+
# scale_x_log10(limits=c(1,1000))+
annotate("text", x = 0, y = 200, label =paste("n=",n_test,"\n[%]=",percent_test,sep=""))
c=as.data.frame(val) %>%
ggplot(aes(x=!!sym(VARI)))+
geom_histogram()+
# scale_x_log10(limits=c(1,1000))+
annotate("text", x = 0, y = 200, label =paste("n=",n_val,"\n[%]=",percent_val,sep=""))
# print( ggarrange( a,b,c, ncol=1))
PLOT=ggarrange( a,b,c, ncol=1)
print(PLOT)
ggsave(PLOT, filename = "split-distribution.pdf", width = 6, height = 8)
# Define the four scenarios
scenarios <- list(
all_vari = list(name = "Base Model with ALL Variables",
exclude = c()),
no_nuc = list(name = "Without Nucleotide_Content",
exclude = c("NUC_","diNUC_", "triNUC_", "tetraNUC_")),
no_rna = list(name = "Without RNAseq",
exclude = c("RNAseq_RPKM")),
no_tt = list(name = "Without TTseq",
exclude = c("TTseq_RPKM")),
no_rna_tt = list(name = "Without RNAseq and TTseq",
exclude = c("RNAseq_RPKM", "TTseq_RPKM") ),
no_pos = list(name = "Without Positional information",
exclude = c("nPOS"))
)
# Store results
results_list <- list()
plot_list <- list()
# Loop through each scenario
for (scenario_name in names(scenarios)) {
cat("\n\n========================================\n")
cat("Training model:", scenarios[[scenario_name]]$name, "\n")
cat("========================================\n\n")
# Define variables
y <- VARI
x <- setdiff(names(train), y)
# x <- x[!grepl("nPOS", x)]
# x <- x[!grepl("UTR_LENGTH", x)]
# Remove FBgn and WEIGHT
x <- x[!grepl("FBgn", x)]
# Remove features based on scenario
for (pattern in scenarios[[scenario_name]]$exclude) {
x <- x[!grepl(pattern, x)]
}
cat("Features used:", length(x), "\n")
cat("Features:", paste(x, collapse = ", "), "\n\n")
#if no saved model esixts train new one
if (file.exists(paste0("model_ablation_study_",scenario_name))) {
yb_ml <- h2o.loadModel(paste0(WD,"/model_ablation_study_",scenario_name))
cat("Loaded existing model for scenario:", scenario_name, "\n\n")
next
}else{
# Train model
yb_ml_scenario <- h2o.automl(
x = x, y = y,
training_frame = train,
max_models = 30,
seed = 3242,
nfolds = 0,
validation_frame = val,
weights_column = "WEIGHT",
leaderboard_frame = test,
include_algos = c("GBM"),
stopping_metric = "RMSE",
sort_metric = "RMSE"
)
# Get leaderboard and determine best model by CCC
leaderboard <- as.data.frame(yb_ml_scenario@leaderboard)
ccc_vec <- sapply(leaderboard$model_id, function(mid) {
model <- h2o.getModel(mid)
pred <- h2o.predict(model, test)
actual <- 10^as.vector(test[[y]])
predicted <- 10^as.vector(pred)
ccc_value <- CCC(predicted, actual)
ccc_value <- round(unlist(ccc_value[1])[1], 5)
ccc_value
})
leaderboard$CCC <- ccc_vec
leaderboard <- leaderboard[order(-leaderboard$CCC), ]
LEADER <- leaderboard[1, 1]
yb_ml <- h2o.getModel(LEADER)
#safe model
h2o.saveModel(filename=paste0("model_ablation_study_",scenario_name),object = yb_ml, path = getwd(), force = TRUE)
}
# Store model and leaderboard
results_list[[scenario_name]] <- list(
model = yb_ml
)
# Variable importance plot
varimp_data <- h2o.varimp(yb_ml)
p_varimp <- varimp_data %>%
ggplot(aes(x = reorder(variable, scaled_importance), y = scaled_importance)) +
geom_col() +
coord_flip() +
ggtitle(scenarios[[scenario_name]]$name) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12)
)
print(p_varimp)
ggsave(p_varimp, filename = paste0("variable-importance-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
# Predictions on test set (log scale)
pred <- h2o.predict(yb_ml, test)
pred_dataFIN <- as_tibble(test) %>%
bind_cols(PRED = as.vector(pred))
correlation <- lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_YB) %>%
summary() %>% .$r.squared
CCC_value <- CCC(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
PEAR <- cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED, method = "pearson")$estimate
SPEAR <- SpearmanRho(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
p_log <- pred_dataFIN %>%
ggplot(aes(x = sRNAdata_average_YB, y = PRED)) +
geom_point_rast(size = 0.5, alpha = 0.3) +
geom_density_2d(color = "red", alpha = 0.5) +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1) +
scale_x_continuous(limits = c(-1, 3.5)) +
scale_y_continuous(limits = c(-1, 3.5)) +
annotation_logticks(sides = "lb", outside = FALSE) +
coord_cartesian(clip = "off") +
annotate("text", x = -1, y = 2,
label = paste("R2: ", round(correlation, 2), "\n",
"CCC=", round(unlist(CCC_value[1])[1], 2), "\n",
"PEAR=", round(PEAR, 2), "\n",
"SPEAR=", round(SPEAR, 2), "\n",
paste("n=", nrow(pred_dataFIN), sep = ""), sep = ""),
hjust = 0) +
ggtitle(paste(scenarios[[scenario_name]]$name, "(log scale)")) +
theme_bw() +
scale_color_viridis_c() +
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p_log)
ggsave(p_log, filename = paste0("Prediction_vs_real_log-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
# Predictions on test set (original scale)
pred_dataFIN_orig <- pred_dataFIN %>%
mutate(
sRNAdata_average_YB = 10^sRNAdata_average_YB,
PRED = 10^PRED
)
correlation_orig <- lm(pred_dataFIN_orig$PRED ~ pred_dataFIN_orig$sRNAdata_average_YB) %>%
summary() %>% .$r.squared
CCC_value_orig <- CCC(pred_dataFIN_orig$sRNAdata_average_YB, pred_dataFIN_orig$PRED)
PEAR_orig <- cor.test(pred_dataFIN_orig$sRNAdata_average_YB, pred_dataFIN_orig$PRED, method = "pearson")$estimate
SPEAR_orig <- SpearmanRho(pred_dataFIN_orig$sRNAdata_average_YB, pred_dataFIN_orig$PRED)
p_orig <- pred_dataFIN_orig %>%
ggplot(aes(x = sRNAdata_average_YB, y = PRED)) +
geom_point_rast(size = 0.5, alpha = 0.3) +
geom_density_2d(color = "red", alpha = 0.5) +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1) +
scale_x_log10(limits = c(0.01, 5000)) +
scale_y_log10(limits = c(0.01, 5000)) +
annotation_logticks(sides = "lb", outside = FALSE) +
coord_cartesian(clip = "off") +
annotate("text", x = 0.01, y = 3,
label = paste("R2: ", round(correlation_orig, 2), "\n",
"CCC=", round(unlist(CCC_value_orig[1])[1], 2), "\n",
"PEAR=", round(PEAR_orig, 2), "\n",
"SPEAR=", round(SPEAR_orig, 2), "\n",
paste("n=", nrow(pred_dataFIN_orig), sep = ""), sep = ""),
hjust = 0) +
ggtitle(paste(scenarios[[scenario_name]]$name, "(original scale)")) +
theme_bw() +
scale_color_viridis_c() +
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p_orig)
ggsave(p_orig, filename = paste0("Prediction_vs_real_orig-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
# Store plots
plot_list[[scenario_name]] <- list(
varimp = p_varimp,
log_scale = p_log,
orig_scale = p_orig
)
# Store metrics
results_list[[scenario_name]]$metrics <- list(
log_scale = list(R2 = correlation, CCC = unlist(CCC_value[1])[1],
PEAR = PEAR, SPEAR = SPEAR),
orig_scale = list(R2 = correlation_orig, CCC = unlist(CCC_value_orig[1])[1],
PEAR = PEAR_orig, SPEAR = SPEAR_orig)
)
}
##
##
## ========================================
## Training model: Base Model with ALL Variables
## ========================================
##
## Features used: 9
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: all_vari
##
##
##
## ========================================
## Training model: Without Nucleotide_Content
## ========================================
##
## Features used: 5
## Features: TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_nuc
##
##
##
## ========================================
## Training model: Without RNAseq
## ========================================
##
## Features used: 8
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_rna
##
##
##
## ========================================
## Training model: Without TTseq
## ========================================
##
## Features used: 8
## Features: NUC_A, NUC_C, NUC_G, NUC_T, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_tt
##
##
##
## ========================================
## Training model: Without RNAseq and TTseq
## ========================================
##
## Features used: 7
## Features: NUC_A, NUC_C, NUC_G, NUC_T, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_rna_tt
##
##
##
## ========================================
## Training model: Without Positional information
## ========================================
##
## Features used: 8
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH
##
## Loaded existing model for scenario: no_pos
# Print summary comparison
cat("\n\n========================================\n")
##
##
## ========================================
cat("SUMMARY COMPARISON\n")
## SUMMARY COMPARISON
cat("========================================\n\n")
## ========================================
#define variables
y <- VARI
x <- setdiff(names(train), y)
#remove FBgn from x
x = x[!grepl("FBgn", x)]
# x = x[!grepl("nPOS", x)]
# x = x[!grepl("WEIGHT", x)]
# x = x[!grepl("RNAseq_RPKM", x)]
# x = x[!grepl("diNUC", x)]
# x = x[!grepl("TTseq_RPKM", x)]
#load old model
# yb_ml <- h2o.loadModel("model_for_Fig3_26-06")
#train model
yb_ml_all <- h2o.automl(x = x, y = y,
training_frame = train,
max_models = 30,
seed = 3242,
nfolds = 0,
validation_frame = val,
weights_column = "WEIGHT",
leaderboard_frame = test,
include_algos = c("GBM"),
stopping_metric = "RMSE",
sort_metric = "RMSE"
)
## | | | 0% | |======== | 11% | |=================== | 28% | |======================================================================| 100%
#print leaderboard and extract leader
lb <- yb_ml_all@leaderboard
print(lb, n = nrow(lb))
## model_id rmse mse mae
## 1 GBM_grid_1_AutoML_29_20260209_125655_model_12 0.2532103 0.06411547 0.1915223
## 2 GBM_grid_1_AutoML_29_20260209_125655_model_14 0.2535529 0.06428905 0.1902692
## 3 GBM_5_AutoML_29_20260209_125655 0.2538453 0.06443741 0.1907608
## 4 GBM_grid_1_AutoML_29_20260209_125655_model_24 0.2545341 0.06478761 0.1937047
## 5 GBM_grid_1_AutoML_29_20260209_125655_model_20 0.2547599 0.06490259 0.1964397
## 6 GBM_grid_1_AutoML_29_20260209_125655_model_5 0.2548168 0.06493160 0.1936697
## 7 GBM_grid_1_AutoML_29_20260209_125655_model_7 0.2550664 0.06505885 0.1933082
## 8 GBM_3_AutoML_29_20260209_125655 0.2553175 0.06518700 0.1939062
## 9 GBM_2_AutoML_29_20260209_125655 0.2563150 0.06569740 0.1929557
## 10 GBM_grid_1_AutoML_29_20260209_125655_model_19 0.2573617 0.06623505 0.1937307
## 11 GBM_1_AutoML_29_20260209_125655 0.2574082 0.06625899 0.1943142
## 12 GBM_grid_1_AutoML_29_20260209_125655_model_13 0.2580403 0.06658479 0.1949764
## 13 GBM_grid_1_AutoML_29_20260209_125655_model_15 0.2586708 0.06691058 0.1957793
## 14 GBM_grid_1_AutoML_29_20260209_125655_model_18 0.2594795 0.06732963 0.1945422
## 15 GBM_4_AutoML_29_20260209_125655 0.2595975 0.06739087 0.1978789
## 16 GBM_grid_1_AutoML_29_20260209_125655_model_17 0.2597934 0.06749260 0.2001737
## 17 GBM_grid_1_AutoML_29_20260209_125655_model_4 0.2600178 0.06760924 0.2010658
## 18 GBM_grid_1_AutoML_29_20260209_125655_model_3 0.2609740 0.06810742 0.1999161
## 19 GBM_grid_1_AutoML_29_20260209_125655_model_23 0.2632167 0.06928305 0.1982176
## 20 GBM_grid_1_AutoML_29_20260209_125655_model_16 0.2641466 0.06977345 0.2052408
## 21 GBM_grid_1_AutoML_29_20260209_125655_model_11 0.2645565 0.06999016 0.1988558
## 22 GBM_grid_1_AutoML_29_20260209_125655_model_2 0.2663467 0.07094057 0.2028386
## 23 GBM_grid_1_AutoML_29_20260209_125655_model_10 0.2696715 0.07272272 0.2017055
## 24 GBM_grid_1_AutoML_29_20260209_125655_model_25 0.2712917 0.07359920 0.2085088
## 25 GBM_grid_1_AutoML_29_20260209_125655_model_9 0.2722569 0.07412382 0.2096419
## 26 GBM_grid_1_AutoML_29_20260209_125655_model_8 0.2723195 0.07415793 0.2101993
## 27 GBM_grid_1_AutoML_29_20260209_125655_model_22 0.2727514 0.07439335 0.2059555
## 28 GBM_grid_1_AutoML_29_20260209_125655_model_1 0.2740897 0.07512519 0.2069040
## 29 GBM_grid_1_AutoML_29_20260209_125655_model_6 0.2742863 0.07523300 0.2132088
## 30 GBM_grid_1_AutoML_29_20260209_125655_model_21 0.3208272 0.10293009 0.2499838
## rmsle mean_residual_deviance
## 1 0.09702621 0.06411547
## 2 0.09673560 0.06428905
## 3 0.09711611 0.06443741
## 4 0.09945092 0.06478761
## 5 0.09962617 0.06490259
## 6 0.09923710 0.06493160
## 7 0.09969652 0.06505885
## 8 0.09736962 0.06518700
## 9 0.09768783 0.06569740
## 10 0.09836633 0.06623505
## 11 0.09960194 0.06625899
## 12 0.09784400 0.06658479
## 13 0.09906711 0.06691058
## 14 0.10229589 0.06732963
## 15 0.09896965 0.06739087
## 16 0.10055985 0.06749260
## 17 0.10218689 0.06760924
## 18 0.10078450 0.06810742
## 19 0.09871070 0.06928305
## 20 0.10159930 0.06977345
## 21 0.09909401 0.06999016
## 22 0.10069429 0.07094057
## 23 0.10066704 0.07272272
## 24 0.10345572 0.07359920
## 25 0.10515590 0.07412382
## 26 0.10573962 0.07415793
## 27 0.10174767 0.07439335
## 28 0.10206149 0.07512519
## 29 0.10591233 0.07523300
## 30 0.12927222 0.10293009
##
## [30 rows x 6 columns]
yb_ml=yb_ml_all@leader
#determine best model based on CCC and not RMSE
leaderboard <- as.data.frame(yb_ml_all@leaderboard)
ccc_vec <- sapply(leaderboard$model_id, function(mid) {
model <- h2o.getModel(mid)
pred <- h2o.predict(model, test)
actual <- 10^as.vector(test[[y]])
predicted <- 10^as.vector(pred)
ccc_value = CCC(predicted, actual)
ccc_value = round(unlist(ccc_value[1])[1],5)
ccc_value
})
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
## | | | 0% | |======================================================================| 100%
leaderboard$CCC <- ccc_vec
leaderboard = leaderboard[order(-leaderboard$CCC), ]
leaderboard
## model_id rmse mse mae
## 2 GBM_grid_1_AutoML_29_20260209_125655_model_14 0.2535529 0.06428905 0.1902692
## 11 GBM_1_AutoML_29_20260209_125655 0.2574082 0.06625899 0.1943142
## 14 GBM_grid_1_AutoML_29_20260209_125655_model_18 0.2594795 0.06732963 0.1945422
## 1 GBM_grid_1_AutoML_29_20260209_125655_model_12 0.2532103 0.06411547 0.1915223
## 3 GBM_5_AutoML_29_20260209_125655 0.2538453 0.06443741 0.1907608
## 7 GBM_grid_1_AutoML_29_20260209_125655_model_7 0.2550664 0.06505885 0.1933082
## 4 GBM_grid_1_AutoML_29_20260209_125655_model_24 0.2545341 0.06478761 0.1937047
## 10 GBM_grid_1_AutoML_29_20260209_125655_model_19 0.2573617 0.06623505 0.1937307
## 17 GBM_grid_1_AutoML_29_20260209_125655_model_4 0.2600178 0.06760924 0.2010658
## 21 GBM_grid_1_AutoML_29_20260209_125655_model_11 0.2645565 0.06999016 0.1988558
## 13 GBM_grid_1_AutoML_29_20260209_125655_model_15 0.2586708 0.06691058 0.1957793
## 6 GBM_grid_1_AutoML_29_20260209_125655_model_5 0.2548168 0.06493160 0.1936697
## 9 GBM_2_AutoML_29_20260209_125655 0.2563150 0.06569740 0.1929557
## 8 GBM_3_AutoML_29_20260209_125655 0.2553175 0.06518700 0.1939062
## 28 GBM_grid_1_AutoML_29_20260209_125655_model_1 0.2740897 0.07512519 0.2069040
## 22 GBM_grid_1_AutoML_29_20260209_125655_model_2 0.2663467 0.07094057 0.2028386
## 12 GBM_grid_1_AutoML_29_20260209_125655_model_13 0.2580403 0.06658479 0.1949764
## 19 GBM_grid_1_AutoML_29_20260209_125655_model_23 0.2632167 0.06928305 0.1982176
## 27 GBM_grid_1_AutoML_29_20260209_125655_model_22 0.2727514 0.07439335 0.2059555
## 5 GBM_grid_1_AutoML_29_20260209_125655_model_20 0.2547599 0.06490259 0.1964397
## 15 GBM_4_AutoML_29_20260209_125655 0.2595975 0.06739087 0.1978789
## 23 GBM_grid_1_AutoML_29_20260209_125655_model_10 0.2696715 0.07272272 0.2017055
## 20 GBM_grid_1_AutoML_29_20260209_125655_model_16 0.2641466 0.06977345 0.2052408
## 18 GBM_grid_1_AutoML_29_20260209_125655_model_3 0.2609740 0.06810742 0.1999161
## 16 GBM_grid_1_AutoML_29_20260209_125655_model_17 0.2597934 0.06749260 0.2001737
## 24 GBM_grid_1_AutoML_29_20260209_125655_model_25 0.2712917 0.07359920 0.2085088
## 29 GBM_grid_1_AutoML_29_20260209_125655_model_6 0.2742863 0.07523300 0.2132088
## 26 GBM_grid_1_AutoML_29_20260209_125655_model_8 0.2723195 0.07415793 0.2101993
## 25 GBM_grid_1_AutoML_29_20260209_125655_model_9 0.2722569 0.07412382 0.2096419
## 30 GBM_grid_1_AutoML_29_20260209_125655_model_21 0.3208272 0.10293009 0.2499838
## rmsle mean_residual_deviance CCC
## 2 0.09673560 0.06428905 0.76475
## 11 0.09960194 0.06625899 0.76458
## 14 0.10229589 0.06732963 0.76236
## 1 0.09702621 0.06411547 0.75900
## 3 0.09711611 0.06443741 0.75466
## 7 0.09969652 0.06505885 0.74710
## 4 0.09945092 0.06478761 0.73994
## 10 0.09836633 0.06623505 0.73955
## 17 0.10218689 0.06760924 0.73934
## 21 0.09909401 0.06999016 0.73707
## 13 0.09906711 0.06691058 0.73633
## 6 0.09923710 0.06493160 0.73570
## 9 0.09768783 0.06569740 0.73569
## 8 0.09736962 0.06518700 0.73507
## 28 0.10206149 0.07512519 0.73448
## 22 0.10069429 0.07094057 0.73336
## 12 0.09784400 0.06658479 0.73255
## 19 0.09871070 0.06928305 0.72999
## 27 0.10174767 0.07439335 0.72761
## 5 0.09962617 0.06490259 0.72163
## 15 0.09896965 0.06739087 0.71483
## 23 0.10066704 0.07272272 0.70892
## 20 0.10159930 0.06977345 0.70764
## 18 0.10078450 0.06810742 0.67027
## 16 0.10055985 0.06749260 0.66981
## 24 0.10345572 0.07359920 0.65740
## 29 0.10591233 0.07523300 0.63638
## 26 0.10573962 0.07415793 0.61646
## 25 0.10515590 0.07412382 0.60698
## 30 0.12927222 0.10293009 0.50805
LEADER = leaderboard[1,1]
yb_ml = h2o.getModel(LEADER)
#
# # #save leader model
# h2o.saveModel(filename="model_for_Fig3_26-06",object = yb_ml_all@leader, path = getwd(), force = TRUE)
#extract variable importance
model = yb_ml
exm <- h2o.explain(model, test, top_n_features=8)
exm
##
##
## Residual Analysis
## =================
##
## > Residual Analysis plots the fitted values vs residuals on a test dataset. Ideally, residuals should be randomly distributed. Patterns in this plot can indicate potential problems with the model selection, e.g., using simpler model than necessary, not accounting for heteroscedasticity, autocorrelation, etc. Note that if you see "striped" lines of residuals, that is an artifact of having an integer valued (vs a real valued) response variable.
##
##
## Learning Curve Plot
## ===================
##
## > Learning curve plot shows the loss function/metric dependent on number of iterations or trees for tree-based algorithms. This plot can be useful for determining whether the model overfits.
##
##
## Variable Importance
## ===================
##
## > The variable importance plot shows the relative importance of the most important variables in the model.
##
##
## SHAP Summary
## ============
##
## > SHAP summary plot shows the contribution of the features for each instance (row of data). The sum of the feature contributions and the bias term is equal to the raw prediction of the model, i.e., prediction before applying inverse link function.
##
##
## Partial Dependence Plots
## ========================
##
## > Partial dependence plot (PDP) gives a graphical depiction of the marginal effect of a variable on the response. The effect of a variable is measured in change in the mean response. PDP assumes independence between the feature for which is the PDP computed and the rest.
##
##
## Individual Conditional Expectations
## ===================================
##
## > An Individual Conditional Expectation (ICE) plot gives a graphical depiction of the marginal effect of a variable on the response. ICE plots are similar to partial dependence plots (PDP); PDP shows the average effect of a feature while ICE plot shows the effect for a single instance. This function will plot the effect for each decile. In contrast to the PDP, ICE plots can provide more insight, especially when there is stronger feature interaction.
#plot variable importance
#plot varimp as pdf
x=h2o.varimp(yb_ml)
p = x %>%
ggplot(aes(x=reorder(variable, scaled_importance), y=scaled_importance))+
geom_col()+
# scale_y_continuous( limits=c(0,1))+
coord_flip()+
theme_bw()+
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12)
)
print(p)
ggsave("variable-importance.pdf", p, width = 6, height = 6, dpi = 300)
x=h2o.shap_summary_plot(yb_ml, test)
#predict on test set
model = yb_ml
pred <- h2o.predict(model, test)
## | | | 0% | |======================================================================| 100%
#calculate correlation between predicted and observed values
pred_dataFIN= as_tibble(test) %>%
bind_cols(PRED=as.vector(pred))
#calculate correlation
correlation <- cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_YB) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#calculate deviance for a linear model
model <- lm(PRED ~ sRNAdata_average_YB, data = pred_dataFIN)
deviance(model)
## [1] 105.6279
#only pearson correlation
PEAR=cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#plot predictions vs real data
p=pred_dataFIN %>%
ggplot(aes(x=sRNAdata_average_YB,y=PRED, label=FBgn )) +
geom_point_rast(size=0.5, alpha=0.3)+
geom_density_2d(color="red",alpha=0.5)+
geom_smooth(method="lm")+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
scale_x_continuous(limits=c(-1,3.5))+
scale_y_continuous(limits=c(-1,3.5))+
annotate("text", x = -1, y = 2,
label = paste("R2: ", round(correlation, 2),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_c()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p)
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggsave("Prediction_vs_real.log.pdf",p , width = 6, height = 6, dpi = 300)
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
ggplotly(p, tooltip = c("FBgn", "sRNAdata_average_YB", "PRED")) %>%
layout(title = "Predicted vs Observed sRNA reads (log scale)")
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
## Warning: Aspect ratios aren't yet implemented, but you can manually set a
## suitable height/width
#predict on test set
model = yb_ml
pred <- h2o.predict(model, test)
## | | | 0% | |======================================================================| 100%
#calculate correlation between predicted and observed values
pred_dataFIN= as_tibble(test) %>%
bind_cols(PRED=as.vector(pred))
pred_dataFIN <- pred_dataFIN %>%
mutate(sRNAdata_average_YB = 10^sRNAdata_average_YB,
PRED = 10^PRED
) # Convert back to original scale
#calculate correlation
correlation <- cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_YB) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#calculate deviance for a linear model
model <- lm(PRED ~ sRNAdata_average_YB, data = pred_dataFIN)
deviance(model)
## [1] 17559010
#only pearson correlation
PEAR=cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#plot predictions vs real data
p=pred_dataFIN %>%
ggplot(aes(x=sRNAdata_average_YB,y=PRED )) +
geom_point_rast(size=0.5, alpha=0.3)+
geom_density_2d(color="red",alpha=0.5)+
geom_smooth(method="lm")+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
scale_x_log10(limits=c(0.01,5000))+
scale_y_log10(limits=c(0.01,5000))+
annotate("text", x = 0.01, y = 3,
label = paste("R2: ", round(correlation, 2),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_c()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p)
## `geom_smooth()` using formula = 'y ~ x'
ggsave("Prediction_vs_real.pdf",p , width = 6, height = 6, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
calibration_curve <- pred_dataFIN %>%
mutate(bin = ntile(log10(PRED), 10)) %>%
group_by(bin) %>%
summarize(
mean_predicted_probability = mean(log10(PRED)),
observed_frequency = mean(log10(sRNAdata_average_YB))
)
# Plot calibration curve
calibration_curve %>%
ggplot(aes(y=mean_predicted_probability, x=observed_frequency))+
geom_point()+
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red")+
scale_x_continuous(limits=c(0,3))+
scale_y_continuous(limits=c(0,3))+
labs(x = "Mean Observed per Bin (log10)", y = "Mean Predicted per Bin (log10)") +
theme_cowplot(14) +
theme(
aspect.ratio = 1,
axis.text.x = element_text(margin = margin(t = 9, unit = "pt")),
axis.text.y = element_text(margin = margin(r = 9, unit = "pt")),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.position = "none",
# Black border on facet strips
strip.background = element_rect(fill = "white", color = "black", size = 1),
strip.text = element_text(face = "bold", color = "black")
)
pred_dataFIN %>%
separate(FBgn, c("ID","POS"), sep=":!:", remove=FALSE,convert=TRUE)%>%
mutate(
POSbin=case_when(
POS<10 ~ "start",
TRUE ~ "end"
)
)%>%
ggplot(aes(x=sRNAdata_average_YB,y=PRED ,color=nPOS)) +
geom_point_rast(size=0.5, alpha=0.3)+
geom_density_2d(color="red",alpha=0.5)+
geom_smooth(method="lm")+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
scale_x_log10(limits=c(0.01,5000))+
scale_y_log10(limits=c(0.01,5000))+
annotate("text", x = 0.01, y = 3,
label = paste("R2: ", round(correlation, 2),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
facet_row(~POSbin)+
scale_color_viridis_c()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
############################################################################################### ############################################################################################### ############################################################################################### # ########################### model on siYB tiles - ds600 ############################################### ############################################################################################### ############################################################################################### ############################################################################################### # model on siYB tiles
# load data
RAW = read_tsv("YBdep_tiles.100_0_100.txt", col_names = TRUE)%>%
# RAW = read_tsv("tiles.100_-600_750_quantified.allLIBs.txt", col_names = TRUE)%>%
#remove lost YB library
select(-contains("243686"))%>%
mutate(
TYPE = case_when(
grepl("CDS", CHR) ~ "CDS",
grepl("UTR", CHR) ~ "UTR",
grepl("CLUSTER", CHR) ~ "CLUSTER",
TRUE ~ "OTHER"
),
)%>%
{}
## Rows: 84980 Columns: 111
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): CHR, FBgn, STRAND
## dbl (108): START, STOP, X, thickSTART, thickSTOP, nPOS, RNAseq_RPKM, TTseq_R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NUCclass="NUC_"
# filter data
TABLEfilt= RAW %>%
filter(TYPE == "UTR" )%>%
filter(UNIQ > 50, RNAseq_RPKM>1,TTseq_RPKM>1, nPOS>6, UTR_LENGTH>1000 ) %>%
select(-`sRNAdata_average-YB_CLUSTERuniq`)%>%
{}
#calculate YB-dependency
TABLEfilt <- TABLEfilt %>%
mutate(
sRNAdata_average_YB = log10(`sRNAdata_average-YB`),
)%>%
filter(!is.na(sRNAdata_average_YB),!is.na(sRNAdata_average_YB),!is.infinite(sRNAdata_average_YB))%>%
{}
# Calculate percentiles
percentiles <- TABLEfilt %>%
select(FBgn,TYPE,contains("sRNAdata_average-YB")) %>%
pivot_longer(-c(FBgn,TYPE), names_to = "name", values_to = "value", names_ptypes = list(name = factor())) %>%
mutate(name = fct_inorder(name)) %>%
group_by(TYPE,name) %>%
summarise(q10 = quantile(value, 0.05),q25 = quantile(value, 0.25),q75 = quantile(value, 0.75), q90 = quantile(value, 0.95))
## `summarise()` has grouped output by 'TYPE'. You can override using the
## `.groups` argument.
# Join percentiles to original data
data <- TABLEfilt %>%
select(FBgn,TYPE,contains("sRNAdata_average-YB")) %>%
pivot_longer(-c(FBgn,TYPE), names_to = "name", values_to = "value", names_ptypes = list(name = factor())) %>%
mutate(name = fct_inorder(name)) %>%
left_join(percentiles, by = c("name", "TYPE"))
nLINES=nrow(data)
# Plot YB-depleted spread - length normalized
COLORS=paletteer_d("ggthemes::Winter")
p=TABLEfilt %>%
select(FBgn, TYPE, contains("sRNAdata_average-YB"),-contains("RAW"))%>%
pivot_longer(-c(FBgn,TYPE))%>%
ggplot(aes(x=name,y=value))+
geom_violin(size=0.1,alpha=0.1, fill="grey")+
annotate("text", x = Inf, y = Inf,
label = paste("n=",nLINES,sep=""),hjust = 1, vjust = 1)+
scale_y_log10()+
ylab("sRNA reads per 1000 nt")+
geom_hline(yintercept = 12.2, linetype="dashed")+
geom_hline(yintercept = 346, linetype="dashed")+
theme_bw()+
theme(
aspect.ratio = 1,
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
legend.position = "none",
axis.text = element_text(size=12),
axis.title = element_text(size=14)
)
print(p)
# ggsave(p, filename = "YB-depleted-spread.pdf", width = 6, height = 8)
# Load the library
# Start the H2O cluster
# h2o.no_progress()
h2o.init(max_mem_size = "20g", nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 days 13 hours
## H2O cluster timezone: Europe/Vienna
## H2O data parsing timezone: UTC
## H2O cluster version: 3.46.0.7
## H2O cluster version age: 10 months and 12 days
## H2O cluster name: H2O_started_from_R_dominik.handler_fuo893
## H2O cluster total nodes: 1
## H2O cluster total memory: 17.63 GB
## H2O cluster total cores: 11
## H2O cluster allowed cores: 11
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.3.2 (2023-10-31)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is (10 months and 12 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
h2o.removeAll()
#random splitting
VARI="sRNAdata_average_YB"
quantiles <- quantile(TABLEfilt$RNAseq_RPKM, probs = c(0.75, 0.90, 0.95))
TABLEfilt$WEIGHT <- ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[3], 10, # Top 5%: weight = 10
ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[2], 5, # Top 10%: weight = 5
ifelse(TABLEfilt$RNAseq_RPKM >= quantiles[1], 2, # Top 25%: weight = 2
1))) # Others: weight = 1
scaled_RNA <- (TABLEfilt$`sRNAdata_average-YB` - min(TABLEfilt$`sRNAdata_average-YB`)) /
(max(TABLEfilt$`sRNAdata_average-YB`) - min(TABLEfilt$`sRNAdata_average-YB`))*1.4
TABLEfilt$WEIGHT = 100^scaled_RNA
max_smallRNA <- max(TABLEfilt$`sRNAdata_average-YB`)
TABLEfilt$WEIGHT <- 100^(1/((max_smallRNA + 1) / (TABLEfilt$`sRNAdata_average-YB` + 1)))
p=TABLEfilt %>%
ggplot(aes(x=`sRNAdata_average-YB`, y=WEIGHT, text=FBgn)) +
geom_point()
print(p)
#splitting keeping genes together
TABLEmodel = TABLEfilt%>%
filter(TYPE=="UTR",nPOS>6)%>%
select(FBgn, VARI,starts_with(NUCclass), TTseq_RPKM,CHR,RNAseq_RPKM,WEIGHT, UTR_LENGTH, nPOS)%>%
# select(FBgn, VARI, starts_with(NUCclass), TTseq_RPKM,CHR,RNAseq_RPKM,WEIGHT )%>%
{}
set.seed(12) # reproducible
unique_chr <- unique(TABLEmodel$CHR)
train_chr <- sample(unique_chr,
size = floor(0.8 * length(unique_chr))) # ≈ 75 % of chromosomes
residual_chr <- setdiff(unique_chr, train_chr) # Remaining chromosomes
val_chr <- sample(residual_chr,
size = floor(0.5 * length(residual_chr))) # ≈ 75 % of chromosomes
test_chr <- setdiff(residual_chr, val_chr)
# --- 3. Split the rows on those chromosome sets -------------------------
train_df <- TABLEmodel %>% filter(CHR %in% train_chr)
val_df <- TABLEmodel %>% filter(CHR %in% val_chr)
test_df <- TABLEmodel %>% filter(CHR %in% test_chr)
# --- 4. Drop CHR (if you don’t want it as a feature) and send to H2O ----
train <- as.h2o(train_df %>% select(-CHR), destination_frame = "train.hex")
## | | | 0% | |======================================================================| 100%
val <- as.h2o(val_df %>% select(-CHR), destination_frame = "val.hex")
## | | | 0% | |======================================================================| 100%
test <- as.h2o(test_df %>% select(-CHR), destination_frame = "test.hex")
## | | | 0% | |======================================================================| 100%
# --- 5. (Optional) inspect sizes ----------------------------------------
n_train <- nrow(train)
n_val <- nrow(val)
n_test <- nrow(test)
percent_train = round(n_train *100 / sum(n_train, n_val, n_test),2)
percent_val = round(n_val *100 / sum(n_train, n_val, n_test),2)
percent_test = round(n_test *100 / sum(n_train, n_val, n_test),2)
h2o.dim(train)
## [1] 5726 11
h2o.dim(val)
## [1] 636 11
h2o.dim(test)
## [1] 815 11
a=as.data.frame(train) %>%
ggplot(aes(x=!!sym(VARI)))+
geom_histogram()+
# scale_x_log10(limits=c(1,1000))+
annotate("text", x = 0, y = 200, label =paste("n=",n_train,"\n[%]=",percent_train,sep=""))
b=as.data.frame(test)%>%
ggplot(aes(x=!!sym(VARI)))+
geom_histogram()+
# scale_x_log10(limits=c(1,1000))+
annotate("text", x = 0, y = 200, label =paste("n=",n_test,"\n[%]=",percent_test,sep=""))
c=as.data.frame(val) %>%
ggplot(aes(x=!!sym(VARI)))+
geom_histogram()+
# scale_x_log10(limits=c(1,1000))+
annotate("text", x = 0, y = 200, label =paste("n=",n_val,"\n[%]=",percent_val,sep=""))
# print( ggarrange( a,b,c, ncol=1))
PLOT=ggarrange( a,b,c, ncol=1)
print(PLOT)
ggsave(PLOT, filename = "split-distribution.ds600.pdf", width = 6, height = 8)
# Define the four scenarios
scenarios <- list(
all_vari = list(name = "Base Model with ALL Variables",
exclude = c()),
no_nuc = list(name = "Without Nucleotide_Content",
exclude = c("NUC","diNUC_", "triNUC_", "tetraNUC_")),
no_rna = list(name = "Without RNAseq",
exclude = c("RNAseq_RPKM")),
no_tt = list(name = "Without TTseq",
exclude = c("TTseq_RPKM")),
no_rna_tt = list(name = "Without RNAseq and TTseq",
exclude = c("RNAseq_RPKM", "TTseq_RPKM") ),
only_rna = list(name = "Only using RNAseq and TTseq",
exclude = c("NUC", "nPOS","UTR_LENGTH") ),
only_rna_G = list(name = "Only using RNAseq, TTseq and G nucleotide content",
exclude = c("NUC_T","NUC_C","NUC_A", "nPOS","UTR_LENGTH") ),
no_pos_utr = list(name = "Without Positional information and UTR length",
exclude = c( "nPOS","UTR_LENGTH") ),
no_pos = list(name = "Without Positional information",
exclude = c("nPOS"))
)
# # Define the four scenarios
# scenarios <- list(
# all_vari = list(name = "Base Model with ALL Variables",
# exclude = c()),
# only_rna_G = list(name = "Only using RNAseq and TTseq",
# exclude = c("NUC_T","NUC_C","NUC_A", "nPOS","UTR_LENGTH") ),
# only_rna = list(name = "Only using RNAseq and TTseq",
# exclude = c("NUC", "nPOS","UTR_LENGTH") )
# )
# Store results
results_list <- list()
plot_list <- list()
# Loop through each scenario
for (scenario_name in names(scenarios)) {
cat("\n\n========================================\n")
cat("Training model:", scenarios[[scenario_name]]$name, "\n")
cat("========================================\n\n")
# Define variables
y <- VARI
x <- setdiff(names(train), y)
# x <- x[!grepl("nPOS", x)]
# x <- x[!grepl("UTR_LENGTH", x)]
# Remove FBgn and WEIGHT
x <- x[!grepl("FBgn", x)]
# Remove features based on scenario
for (pattern in scenarios[[scenario_name]]$exclude) {
x <- x[!grepl(pattern, x)]
}
cat("Features used:", length(x), "\n")
cat("Features:", paste(x, collapse = ", "), "\n\n")
# #if no saved model esixts train new one
if (file.exists(paste0(WD,"/model_ablation_study_ds600_",scenario_name))) {
yb_ml <- h2o.loadModel(paste0(WD,"/model_ablation_study_ds600_",scenario_name))
cat("Loaded existing model for scenario:", scenario_name, "\n\n")
next
}else{
# Train model
yb_ml_scenario <- h2o.automl(
x = x, y = y,
training_frame = train,
max_models = 30,
seed = 3242,
nfolds = 0,
validation_frame = val,
weights_column = "WEIGHT",
leaderboard_frame = test,
include_algos = c("GBM"),
stopping_metric = "RMSE",
sort_metric = "RMSE"
)
# Get leaderboard and determine best model by CCC
leaderboard <- as.data.frame(yb_ml_scenario@leaderboard)
ccc_vec <- sapply(leaderboard$model_id, function(mid) {
model <- h2o.getModel(mid)
pred <- h2o.predict(model, test)
actual <- 10^as.vector(test[[y]])
predicted <- 10^as.vector(pred)
ccc_value <- CCC(predicted, actual)
ccc_value <- round(unlist(ccc_value[1])[1], 5)
ccc_value
})
leaderboard$CCC <- ccc_vec
leaderboard <- leaderboard[order(-leaderboard$CCC), ]
LEADER <- leaderboard[1, 1]
yb_ml <- h2o.getModel(LEADER)
#safe model
h2o.saveModel(filename=paste0(WD,"model_ablation_study_ds600_",scenario_name),object = yb_ml, path = getwd(), force = TRUE)
}
# Store model and leaderboard
results_list[[scenario_name]] <- list(
model = yb_ml
)
# Variable importance plot
varimp_data <- h2o.varimp(yb_ml)
p_varimp <- varimp_data %>%
ggplot(aes(x = reorder(variable, scaled_importance), y = scaled_importance)) +
geom_col() +
coord_flip() +
ggtitle(scenarios[[scenario_name]]$name) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12)
)
print(p_varimp)
ggsave(p_varimp, filename = paste0("variable-importance_ds600-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
# Predictions on test set (log scale)
pred <- h2o.predict(yb_ml, test)
pred_dataFIN <- as_tibble(test) %>%
bind_cols(PRED = as.vector(pred))
correlation <- lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_YB) %>%
summary() %>% .$r.squared
CCC_value <- CCC(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
PEAR <- cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED, method = "pearson")$estimate
SPEAR <- SpearmanRho(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
p_log <- pred_dataFIN %>%
ggplot(aes(x = sRNAdata_average_YB, y = PRED)) +
geom_point_rast(size = 0.5, alpha = 0.3) +
geom_density_2d(color = "red", alpha = 0.5) +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1) +
scale_x_continuous(limits = c(0, 3.5)) +
scale_y_continuous(limits = c(0, 3.5)) +
annotation_logticks(sides = "lb", outside = FALSE) +
coord_cartesian(clip = "off") +
annotate("text", x = 1, y = 2,
label = paste("R2: ", round(correlation, 2), "\n",
"CCC=", round(unlist(CCC_value[1])[1], 2), "\n",
"PEAR=", round(PEAR, 2), "\n",
"SPEAR=", round(SPEAR, 2), "\n",
paste("n=", nrow(pred_dataFIN), sep = ""), sep = ""),
hjust = 0) +
ggtitle(paste(scenarios[[scenario_name]]$name, "(log scale)")) +
theme_bw() +
scale_color_viridis_c() +
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p_log)
ggsave(p_log, filename = paste0("Prediction_vs_real_log_ds600-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
# Predictions on test set (original scale)
pred_dataFIN_orig <- pred_dataFIN %>%
mutate(
sRNAdata_average_YB = 10^sRNAdata_average_YB,
PRED = 10^PRED
)
correlation_orig <- lm(pred_dataFIN_orig$PRED ~ pred_dataFIN_orig$sRNAdata_average_YB) %>%
summary() %>% .$r.squared
CCC_value_orig <- CCC(pred_dataFIN_orig$sRNAdata_average_YB, pred_dataFIN_orig$PRED)
PEAR_orig <- cor.test(pred_dataFIN_orig$sRNAdata_average_YB, pred_dataFIN_orig$PRED, method = "pearson")$estimate
SPEAR_orig <- SpearmanRho(pred_dataFIN_orig$sRNAdata_average_YB, pred_dataFIN_orig$PRED)
p_orig <- pred_dataFIN_orig %>%
ggplot(aes(x = sRNAdata_average_YB, y = PRED)) +
geom_point_rast(size = 0.5, alpha = 0.3) +
geom_density_2d(color = "red", alpha = 0.5) +
geom_smooth(method = "lm") +
geom_abline(intercept = 0, slope = 1) +
scale_x_log10(limits = c(0.01, 5000)) +
scale_y_log10(limits = c(0.01, 5000)) +
annotation_logticks(sides = "lb", outside = FALSE) +
coord_cartesian(clip = "off") +
annotate("text", x = 0.01, y = 3,
label = paste("R2: ", round(correlation_orig, 2), "\n",
"CCC=", round(unlist(CCC_value_orig[1])[1], 2), "\n",
"PEAR=", round(PEAR_orig, 2), "\n",
"SPEAR=", round(SPEAR_orig, 2), "\n",
paste("n=", nrow(pred_dataFIN_orig), sep = ""), sep = ""),
hjust = 0) +
ggtitle(paste(scenarios[[scenario_name]]$name, "(original scale)")) +
theme_bw() +
scale_color_viridis_c() +
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
print(p_orig)
ggsave(p_orig, filename = paste0("Prediction_vs_real_orig_ds600-", scenario_name, ".pdf"), width = 6, height = 6, dpi = 300)
# Store plots
plot_list[[scenario_name]] <- list(
varimp = p_varimp,
log_scale = p_log,
orig_scale = p_orig
)
# Store metrics
results_list[[scenario_name]]$metrics <- list(
log_scale = list(R2 = correlation, CCC = unlist(CCC_value[1])[1],
PEAR = PEAR, SPEAR = SPEAR),
orig_scale = list(R2 = correlation_orig, CCC = unlist(CCC_value_orig[1])[1],
PEAR = PEAR_orig, SPEAR = SPEAR_orig)
)
}
##
##
## ========================================
## Training model: Base Model with ALL Variables
## ========================================
##
## Features used: 9
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: all_vari
##
##
##
## ========================================
## Training model: Without Nucleotide_Content
## ========================================
##
## Features used: 5
## Features: TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_nuc
##
##
##
## ========================================
## Training model: Without RNAseq
## ========================================
##
## Features used: 8
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_rna
##
##
##
## ========================================
## Training model: Without TTseq
## ========================================
##
## Features used: 8
## Features: NUC_A, NUC_C, NUC_G, NUC_T, RNAseq_RPKM, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_tt
##
##
##
## ========================================
## Training model: Without RNAseq and TTseq
## ========================================
##
## Features used: 7
## Features: NUC_A, NUC_C, NUC_G, NUC_T, WEIGHT, UTR_LENGTH, nPOS
##
## Loaded existing model for scenario: no_rna_tt
##
##
##
## ========================================
## Training model: Only using RNAseq and TTseq
## ========================================
##
## Features used: 3
## Features: TTseq_RPKM, RNAseq_RPKM, WEIGHT
##
## Loaded existing model for scenario: only_rna
##
##
##
## ========================================
## Training model: Only using RNAseq, TTseq and G nucleotide content
## ========================================
##
## Features used: 4
## Features: NUC_G, TTseq_RPKM, RNAseq_RPKM, WEIGHT
##
## Loaded existing model for scenario: only_rna_G
##
##
##
## ========================================
## Training model: Without Positional information and UTR length
## ========================================
##
## Features used: 7
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, RNAseq_RPKM, WEIGHT
##
## Loaded existing model for scenario: no_pos_utr
##
##
##
## ========================================
## Training model: Without Positional information
## ========================================
##
## Features used: 8
## Features: NUC_A, NUC_C, NUC_G, NUC_T, TTseq_RPKM, RNAseq_RPKM, WEIGHT, UTR_LENGTH
##
## Loaded existing model for scenario: no_pos
# Print summary comparison
cat("\n\n========================================\n")
##
##
## ========================================
cat("SUMMARY COMPARISON\n")
## SUMMARY COMPARISON
cat("========================================\n\n")
## ========================================
1.11.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1
############################################################################################### ############################################################################################### # ########################### apply model to CDS ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# filter data
TABLEfilt= RAW %>%
filter(TYPE == "CDS" )%>%
filter(UNIQ > 50, RNAseq_RPKM>1,TTseq_RPKM>1, nPOS>6) %>%
select(-`sRNAdata_average-YB_CLUSTERuniq`)%>%
separate(FBgn, c("ID"), sep=":!:", remove=FALSE,convert=TRUE)%>%
group_by(ID)%>%
mutate(
MAXpos = max(nPOS, na.rm = TRUE),
)%>%
filter(MAXpos>=8)%>%
{}
## Warning: Expected 1 pieces. Additional pieces discarded in 35949 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#calculate YB-dependency
TABLEfilt <- TABLEfilt %>%
mutate(
sRNAdata_average_YB = log10(`sRNAdata_average-YB`),
)%>%
filter(!is.na(sRNAdata_average_YB),!is.na(sRNAdata_average_YB),!is.infinite(sRNAdata_average_YB))%>%
{}
TABLEmodel = TABLEfilt%>%
select(FBgn, VARI,starts_with(NUCclass), TTseq_RPKM,CHR,RNAseq_RPKM)%>%
# select(FBgn, VARI, starts_with(NUCclass), TTseq_RPKM,CHR,RNAseq_RPKM,WEIGHT )%>%
{}
## Adding missing grouping variables: `ID`
set.seed(1238575) # reproducible
unique_chr <- unique(TABLEmodel$CHR)
train_chr <- sample(unique_chr,
size = floor(0.7 * length(unique_chr))) # ≈ 75 % of chromosomes
residual_chr <- setdiff(unique_chr, train_chr) # Remaining chromosomes
val_chr <- sample(residual_chr,
size = floor(0.5 * length(residual_chr))) # ≈ 75 % of chromosomes
test_chr <- setdiff(residual_chr, val_chr)
test <- TABLEmodel %>%
as.h2o()
## | | | 0% | |======================================================================| 100%
#predict on test set
model = yb_ml
pred <- h2o.predict(model, test)
## | | | 0% | |======================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/Validation
## dataset is missing column 'UTR_LENGTH': substituting in a column of NaN
#calculate correlation between predicted and observed values
pred_dataFIN= as_tibble(test) %>%
bind_cols(PRED=as.vector(pred))
#calculate correlation
correlation <- cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN$PRED ~ pred_dataFIN$sRNAdata_average_YB) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#calculate deviance for a linear model
model <- lm(PRED ~ sRNAdata_average_YB, data = pred_dataFIN)
deviance(model)
## [1] 2834.103
#only pearson correlation
PEAR=cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED)
#calculate lm for the offset calculation
#here I switch to Real ~ Pred
fit <- lm( sRNAdata_average_YB ~PRED , data = pred_dataFIN)
intercept <- fit$coefficients[1]
slope <- fit$coefficients[2]
#calculate data to plot model in ggplot
newdata <- data.frame(PRED = seq(min(pred_dataFIN$PRED), max(pred_dataFIN$PRED), length.out = 100))
newdata$x_pred <- predict(fit, newdata)
#plot predictions vs real data
p=pred_dataFIN %>%
ggplot(aes(x=sRNAdata_average_YB,y=PRED, label=FBgn )) +
geom_point_rast(size=0.5, alpha=0.3)+
geom_density_2d(color="red",alpha=0.5)+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
geom_line(data=newdata, aes(y = PRED, x = x_pred,label=PRED), color = "red", linetype = "dotted", linewidth=2 ) +
scale_x_continuous(limits=c(-1,3.5))+
scale_y_continuous(limits=c(-1,3.5))+
annotate("text", x = -1, y = 2,
label = paste("R2: ", round(correlation, 2),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_c()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
## Warning in geom_line(data = newdata, aes(y = PRED, x = x_pred, label = PRED), :
## Ignoring unknown aesthetics: label
print(p)
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave("Prediction_vs_real.log.CDS.pdf",p , width = 6, height = 6, dpi = 300)
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
#determine fit of the data and calibration
intercept=-1.12
#calibrate model according to the determined value
pred_dataFIN = pred_dataFIN %>%
mutate(PRED_scaled = intercept + PRED)
#calculate correlation
correlation <- cor.test( pred_dataFIN$PRED_scaled,pred_dataFIN$sRNAdata_average_YB)
#pseudo r squared of srna data vs pred
correlation = lm(pred_dataFIN$PRED_scaled ~ pred_dataFIN$sRNAdata_average_YB) %>% summary() %>% .$r.squared
#calculate lin's concordance correlation coefficient
library(DescTools)
CCC_value = CCC(pred_dataFIN$PRED_scaled,pred_dataFIN$sRNAdata_average_YB)
fit_calibrated <- lm( sRNAdata_average_YB ~ PRED_scaled , data = pred_dataFIN)
intercept_calibrated <- fit_calibrated$coefficients[1]
slope_calibrated <- fit_calibrated$coefficients[2]
#calculate data to plot model in ggplot
newdata_calibrated <- data.frame(PRED_scaled = seq(min(pred_dataFIN$PRED_scaled), max(pred_dataFIN$PRED_scaled), length.out = 100))
newdata_calibrated$x_pred <- predict(fit_calibrated, newdata_calibrated)
#only pearson correlation
PEAR=cor.test(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED_scaled, method = "pearson")%>% .$estimate
SPEAR=SpearmanRho(pred_dataFIN$sRNAdata_average_YB, pred_dataFIN$PRED_scaled)
#plot predictions vs real data
p=pred_dataFIN %>%
ggplot(aes(x=sRNAdata_average_YB,y=PRED_scaled, label=paste(FBgn,RNAseq_RPKM,TTseq_RPKM,sep="_"))) +
geom_point_rast(size=0.5,alpha=0.3)+
geom_density_2d(color="darkgrey",alpha=0.5)+
# geom_smooth(method="lm")+
#add the correlation coefficient to the plot
geom_abline(intercept = 0, slope = 1)+
geom_line(data=newdata_calibrated, aes(y = PRED_scaled, x = x_pred,label=PRED_scaled), color = "red", linetype = "dotted", linewidth=2 ) +
scale_x_continuous(limits=c(-1,3.5))+
scale_y_continuous(limits=c(-1,3.5))+
# # facet_wrap(~nPOS)+
annotate("text", x = 0.5, y = 3,
label = paste("R2: ", round(correlation, 3),"\n","CCC=",round(unlist(CCC_value[1])[1],2),"\n","PEAR=",round(PEAR,2),"\n","SPEAR=",round(SPEAR,2),"\n",paste("n=",nrow(pred_dataFIN),sep=""),sep=""),hjust = 0)+
theme_bw()+
scale_color_viridis_d()+
theme(
panel.grid.major = element_blank(),
axis.text = element_text(size = 12),
axis.title =element_text(size = 12),
panel.grid.minor = element_blank(),
aspect.ratio = 1
)
## Warning in geom_line(data = newdata_calibrated, aes(y = PRED_scaled, x =
## x_pred, : Ignoring unknown aesthetics: label
print(p)
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave("Prediction_vs_real.log.CDS.calibrated.pdf",p , width = 6, height = 6, dpi = 300)
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_density2d()`).
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
############################################################################################### ############################################################################################### ############################################################################################### # ########################### GL analysis ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# RAW = read_tsv("GL-analysis_origVAL.txt", col_names = TRUE) %>%
RAW = read_tsv("GL-analysis_newRNAseq.txt", col_names = TRUE) %>%
#remove lost YB library
mutate(
origNAME=FBgn
)%>%
separate(FBgn,c("FBgn","TYPE"),sep="_",remove=TRUE)%>%
separate(FBgn,c("FBgn2","NR"),sep="-",remove=FALSE, convert = TRUE)%>%
filter(LENGTH>200, UNIQ>50, !grepl("20A",FBgn), !grepl("77B", FBgn))%>%
# filter(
# if_any(c(AubAGO3,Wsh), ~ . > 0.01),
# if_all(c(AubAGO3,Wsh), ~ . > 0.001),
# LENGTH>200
# )%>%
mutate(
TYPE = case_when(
str_detect(TYPE,"glUTR") ~ "UTR",
TRUE ~ TYPE
),
TYPEdetail =
case_when(
str_detect(FBgn,"flam") ~ "CLUSTERsoma",
str_detect(FBgn,"20A") ~ "CLUSTERsoma",
str_detect(FBgn,"77B") ~ "CLUSTERsoma",
TRUE ~ TYPE
),
ID =
case_when(
str_detect(FBgn,"flam") ~ "flam",
str_detect(FBgn,"20A") ~ "20A",
str_detect(FBgn,"77B") ~ "77B",
TYPE == "CLUSTER" ~ "dsCLUSTER",
TRUE ~ "GENIC"
)
)%>%
# filter(sRNA_Wsh > 0.01 | sRNA_AubAGO3sh > 0.01 )%>%
{}
#
RAWnorm=RAW
RAWnorm %>%
ggplot(aes(x=sRNA_Wsh, y=sRNA_Wsh_new))+
geom_point_rast(size=0.3, alpha=0.3)+
scale_x_log10()+
scale_y_log10()+
theme_cowplot(14)+
geom_abline(intercept = 0,slope=1 , color="red")+
labs(x="piRNA count in Wsh (Kirsten-normalized)",
y="piRNA count in new Wsh (not normalized)")+
theme(legend.position = "none",
aspect.ratio = 1)
NORMfactor_Wsh = RAWnorm %>%
filter(sRNA_Wsh > 0, sRNA_Wsh_new > 0) %>%
mutate(RATIO = sRNA_Wsh_new / sRNA_Wsh)%>%
summarise(MEANRATIO = median(RATIO)) %>%
pull()
RAWnorm = RAWnorm %>%
mutate(
sRNA_Wsh_new_norm = sRNA_Wsh_new / NORMfactor_Wsh
)
RAWnorm %>%
ggplot(aes(x=sRNA_Wsh, y=sRNA_Wsh_new_norm))+
geom_point_rast(size=0.3, alpha=0.3)+
scale_x_log10()+
scale_y_log10()+
theme_cowplot(14)+
geom_abline(intercept = 0,slope=1 , color="red")+
labs(x="piRNA count in Wsh (Kirsten-normalized)",
y="piRNA count in new Wsh (normalized)")+
theme(legend.position = "none",
aspect.ratio = 1)
RAWnorm %>%
ggplot(aes(x=sRNA_AubAGO3sh, y=sRNA_AubAGO3sh_new))+
geom_point_rast(size=0.3, alpha=0.3)+
scale_x_log10()+
scale_y_log10()+
theme_cowplot(14)+
geom_abline(intercept = 0,slope=1 , color="red")+
labs(x="piRNA count in AubAGO3 sh (Kirsten-normalized)",
y="piRNA count in new AubAGO3 sh (not normalized)")+
theme(legend.position = "none",
aspect.ratio = 1)
NORMfactor_AubAGO3 = RAWnorm %>%
filter(sRNA_AubAGO3sh > 0, sRNA_AubAGO3sh_new > 0) %>%
mutate(RATIO = sRNA_AubAGO3sh_new / sRNA_AubAGO3sh)%>%
summarise(MEANRATIO = median(RATIO)) %>%
pull()
RAWnorm = RAWnorm %>%
mutate(
sRNA_AubAGO3sh_new_norm = sRNA_AubAGO3sh_new / NORMfactor_AubAGO3
)
RAWnorm %>%
ggplot(aes(x=sRNA_AubAGO3sh, y=sRNA_AubAGO3sh_new_norm))+
geom_point_rast(size=0.3, alpha=0.3)+
scale_x_log10()+
scale_y_log10()+
theme_cowplot(14)+
geom_abline(intercept = 0,slope=1 , color="red")+
labs(x="piRNA count in AubAGO3 sh (Kirsten-normalized)",
y="piRNA count in new AubAGO3 sh (normalized)")+
theme(legend.position = "none",
aspect.ratio = 1)
RAWnorm = RAWnorm %>%
mutate(
sRNA_Wsh_avg = (sRNA_Wsh_new_norm + sRNA_Wsh)/2,
sRNA_AubAGO3sh_avg = (sRNA_AubAGO3sh_new_norm + sRNA_AubAGO3sh)/2
)
GLcutoff=5
SOMAcutoff=0.20
TABLE = RAWnorm %>%
filter(
if_any(c(sRNA_Wsh_avg,sRNA_shPiwi_PiwiIP_Kirsten ), ~ . > 0),
if_all(c(sRNA_Wsh_avg,sRNA_shPiwi_PiwiIP_Kirsten), ~ . > 0),
LENGTH>200
)%>%
mutate(
RATIOsoma_vs_gl = (sRNA_Wsh_avg) / (sRNA_shPiwi_PiwiIP_Kirsten),
TISSUE = case_when(
RATIOsoma_vs_gl > GLcutoff ~ "GL",
RATIOsoma_vs_gl < SOMAcutoff ~ "SOMA",
TRUE ~ "MIXED"
),
TYPE = factor(TYPE, levels = c( "CDS", "UTR", "CLUSTER"))
)
TEST=TABLE %>%
filter(grepl("FBgn0003015", FBgn) | grepl("FBgn0283442", FBgn))%>%
select(RATIOsoma_vs_gl)%>%
pull()
TABLE %>%
filter( TYPE %in% c("CDS","UTR","CLUSTER"))%>%
mutate(
ID = case_when(
TYPE == "CDS" ~ "CDS",
TYPE == "UTR" ~ "UTR",
TYPE == "CLUSTER" ~ ID
)
) %>%
ggplot(aes(RATIOsoma_vs_gl, fill=ID))+
geom_histogram(bins=50)+
geom_vline(xintercept=TEST , color="black", linetype="dashed")+
scale_x_log10()+
annotation_logticks(sides = "b", outside=FALSE) +
facet_col(~TYPE, scales = "free_y")+
theme_cowplot(14)+
labs(x="RATIO (Wsh / PiwiIP)", y="Count")+
geom_vline(xintercept = GLcutoff, color="red")+
geom_vline(xintercept = SOMAcutoff, color="red")+
#fill using okabe ito
scale_fill_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "flam" = "#e6a025", "dsCLUSTER" = "#cb79a6" )) + # Add this line
theme(
)
ggsave( filename = "GL-soma-splitting.new.pdf", width = 8, height = 6)
# TABLE %>%
# filter( TYPE %in% c("CDS","UTR","CLUSTER"))%>%
# ggplot(aes(x=RATIOsoma_vs_gl,y=sRNA_shControl_PiwiIP_Kirsten, color=ID))+
# geom_point(size=0.3, alpha=0.3)+
# scale_x_log10()+
# scale_y_log10()+
# facet_col(~TYPE)+
# annotation_logticks(sides = "b", outside=FALSE) +
# theme_cowplot(14)+
# labs(x="RATIO (Wsh / PiwiIP)",
# y="piRNA count in wt gl-PIWI IPs")+
# geom_vline(xintercept = 5, color="red")+
# geom_vline(xintercept = 0.2, color="red")+
# #fill using okabe ito
# theme()
TABLE %>%
filter(TISSUE=="GL")%>%
select(origNAME)%>%
write_tsv("GL-genes.txt", col_names = FALSE)
TABLE %>%
filter(TISSUE=="SOMA")%>%
select(origNAME)%>%
write_tsv("SOMA-genes.txt", col_names = FALSE)
plotTABLE = TABLE %>%
filter(
if_any(c(sRNA_AubAGO3sh_avg, sRNA_Wsh_avg), ~ . > 0.000001),
if_all(c(sRNA_AubAGO3sh_avg, sRNA_Wsh_avg), ~ . > 0),
)%>%
mutate(
RATIOsoma = 1,
RATIOgl = sRNA_AubAGO3sh_avg / sRNA_Wsh_avg
)%>%
select(TISSUE, RATIOgl, RATIOsoma, TYPEdetail, TYPE, FBgn)%>%
pivot_longer(c(RATIOgl, RATIOsoma), names_to = "RATIO", values_to = "VALUE")
MEDIANs = plotTABLE %>%
group_by(RATIO, TYPEdetail)%>%
summarize(MEDIAN=median(VALUE, na.rm=TRUE))
p = plotTABLE %>%
mutate(
TYPEdetail = factor(TYPEdetail, levels = c("CDS", "UTR", "CLUSTER","CLUSTERsoma")),
)%>%
filter(!(TYPEdetail == "CLUSTERsoma" & RATIO == "RATIOgl"), ! RATIO=="RATIOsoma")%>%
ggplot(aes(x=TYPEdetail, y=VALUE, color=TYPE, label=paste(FBgn,TYPEdetail,sep=" ")))+
geom_quasirandom_rast(size=0.7)+
facet_row(~RATIO, scales="free_x")+
# Add median crossbars
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)+
coord_cartesian(ylim = c(0.005, 100))+
scale_y_log10()+
annotation_logticks(sides = "l", outside=FALSE) +
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#cb79a6" )) + # Add this line
geom_hline(yintercept = 1, color="red")+
theme_cowplot(14)+
labs(y="piRNA ratio (Aub/ Wsh)", x="Feature type")+
theme(
legend.position = "none"
)
print(p)
ggsave("GL-analysis_AubAGO3_vs_Wsh.pdf",p, width = 3.5, height = 6)
plotTABLE = TABLE %>%
filter(! TYPEdetail == "CLUSTERsoma" ) %>%
mutate(
RATIOzucPiwi = (sRNA_shZuc_PiwiIP_Jakob_new)/sRNA_shW_PiwiIP_Jakob_new,
RATIOzucAub = (sRNA_shZuc_AubIP_Jakob_new)/sRNA_shW_AubIP_Jakob_new,
)%>%
filter( LENGTH>200 ) %>%
mutate(
RATIOzucPiwi = case_when(
if_all(c(sRNA_shZuc_PiwiIP_Jakob_new,sRNA_shW_PiwiIP_Jakob_new), ~ . < 0.00001) ~ NA,
TRUE ~ RATIOzucPiwi
),
RATIOzucAub = case_when(
if_all(c(sRNA_shZuc_AubIP_Jakob_new,sRNA_shW_AubIP_Jakob_new), ~ . < 0.00001) ~ NA,
TRUE ~ RATIOzucAub
)
# if_any(c(sRNA_shZuc_PiwiIP_Jakob,sRNA_shW_PiwiIP_Jakob), ~ . > 0.0001),
# if_all(c(sRNA_shZuc_PiwiIP_Jakob,sRNA_shW_PiwiIP_Jakob), ~ . > 0.00001),
)%>%
select(FBgn, TISSUE, RATIOzucPiwi, RATIOzucAub, TYPEdetail, TYPE)%>%
pivot_longer(c(RATIOzucPiwi, RATIOzucAub), names_to = "RATIO", values_to = "VALUE")%>%
mutate(
TYPEdetail = factor(TYPEdetail, levels = c("CDS", "UTR", "CLUSTER","CLUSTERsoma")),
) %>%
filter( (RATIO == "RATIOzucPiwi" & TISSUE == "GL")| RATIO == "RATIOzucAub" ) %>%
{}
count_data = plotTABLE %>%
group_by( TYPEdetail, RATIO)%>%
summarize(COUNT=n())
p = plotTABLE %>%
ggplot(aes(x=TYPEdetail, y=VALUE, color=TYPE, label=paste(FBgn,TYPEdetail,sep=" "))) +
geom_quasirandom_rast(size=0.6) +
# Add median crossbars
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
) +
annotation_logticks(sides = "l", outside = FALSE) +
geom_text(data = count_data, aes(x = TYPEdetail, y = 0.005, label = COUNT),
color = "black", size = 3, vjust = 0) +
scale_y_log10() +
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#cb79a6" )) +
facet_row(~RATIO, scales="free_x") +
geom_hline(yintercept = 1, color="red") +
theme_cowplot(14) +
labs(y="piRNA ratio (shZuc PiwiIP / shW PiwiIP)", x="Feature type")+
theme(
legend.position = "none"
)
p
p = plotTABLE %>%
mutate(
RATIOnum = case_when(
RATIO == "RATIOzucPiwi" ~ 1,
RATIO == "RATIOzucAub" ~ 2
)
)%>%
ggplot(aes(x=RATIO, y=VALUE )) +
geom_quasirandom_rast(
data = . %>% filter(TYPEdetail == "CDS"),
aes(x = as.numeric(RATIOnum) - 0.2, y = VALUE, color = TYPE),
width = 0.3, # controls horizontal spread
varwidth = FALSE, # fixed width
groupOnX = FALSE,
size = 0.7,
alpha = 1
) +
# UTR slightly to the right
geom_quasirandom_rast(
data = . %>% filter(TYPEdetail == "UTR"),
aes(x = as.numeric(RATIOnum) + 0.2, y = VALUE, color = TYPE),
width = 0.3,
varwidth = FALSE,
groupOnX = FALSE,
size = 0.7,
alpha = 1
) +
# CLUSTER centered on the original LIB positions (narrowed as well)
geom_quasirandom_rast(
data = . %>% filter(TYPEdetail == "CLUSTER"),
aes(x = as.numeric(RATIOnum), y = VALUE, color = TYPE),
width = 0.3,
varwidth = FALSE, # or TRUE if you like varwidth for clusters
groupOnX = TRUE,
size = 0.7,
alpha = 1
) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red")+
labs(x="biogenesis-factor knocked-doww", y="fold-change compared to siGFP")+
scale_y_log10()+
annotation_logticks(sides = "l", outside=FALSE) +
# scale_color_igv()+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#cb79a6" )) + # Add this line
scale_size_identity()+
annotation_logticks(sides = "b", outside=FALSE) +
theme_cowplot(12)+
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
legend.position = "none",
)+
{}
print(p)
ggsave("GL-analysis_Zuc_vs_Wsh.pdf", width = 3.5, height = 6)
plotTABLE = TABLE %>%
select(TISSUE, sRNA_Wsh, AubIP, Ago3IP, sRNA_shW_PiwiIP_Jakob, sRNA_shW_AubIP_Jakob_new, sRNA_shW_AGO3IP_Jakob_new, TYPEdetail, TYPE, FBgn)%>%
mutate(
sRNA_shW_PiwiIP_Jakob = sRNA_shW_PiwiIP_Jakob / 2.9
)%>%
pivot_longer(c(sRNA_Wsh, AubIP, Ago3IP), names_to = "RATIO", values_to = "VALUE")%>%
filter(VALUE>0, TYPEdetail %in% c("CDS","UTR","CLUSTER"))
plotTABLE = plotTABLE %>%
mutate(
RATIO = factor(RATIO, levels = c("sRNA_Wsh", "AubIP", "Ago3IP" ))
)
MEDIANs = plotTABLE %>%
group_by(RATIO, TYPEdetail)%>%
summarize(MEDIAN=median(VALUE, na.rm=TRUE)*100000)
p = plotTABLE %>%
mutate(
TYPEdetail = factor(TYPEdetail, levels = c("CDS", "UTR", "CLUSTER","CLUSTERsoma")),1
)%>%
filter(!(TYPEdetail == "CLUSTERsoma" & RATIO == "RATIOgl"), ! (TYPEdetail=="CLUSTER" & RATIO=="RATIOsoma"))%>%
ggplot(aes(x=TYPEdetail, y=VALUE, color=TYPE, label=paste(FBgn,TYPEdetail,sep=" ")))+
geom_quasirandom_rast()+
facet_row(~RATIO, scales="free_x")+
# Add median crossbars
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)+
scale_y_log10()+
scale_color_manual(values = c("CDS" = "#0274b3", "UTR" = "#343991" , "CLUSTER" = "#cb79a6" )) + # Add this line
theme_cowplot(14)+
labs(y="sRNA abundance", x="Feature type")
print(p)
ggsave("GL-analysis_PIWI_comparison.pdf",p, width = 5, height = 6)
RNAseq_GeTMM = read_tsv("RNAseq_GeTMM_triplicates.txt")
RNAseq_level = TABLE %>%
select(FBgn, TYPE, RNAseq_AubAGO3sh) %>% pivot_wider(names_from = TYPE, values_from = RNAseq_AubAGO3sh)
plotTABLE = TABLE %>%
# mutate(
# TISSUE = case_when(
# TISSUE == "GL" ~ "MIXED",
# TRUE ~ TISSUE
# )
# )%>%
left_join(RNAseq_GeTMM, by="FBgn") %>%
{}
# for (i in c("soma_wt","soma_yb", "SOMA_RNAseq","gl_wt","gl_wt_old","gl_aubAgo3","gl_aubAgo3_old","Wsh_vs_AubAGO3","GL_RNAseq")) {
# for (i in c("gl_aubAgo3", "gl_aubAgo3_GeTMM","GL_RNAseq")) {
for (i in c("gl_aubAgo3_GeTMM")) {
# 1) Select and rename to common x / y
if (i == "soma_wt") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE, TISSUE,
x = sRNA_wtOSC,
y = RNAseq_wtOSC)
limitsX=c(1e-5,1e1)
limitsY=c(1e-7,1e-2)
}
if (i == "soma_yb") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_YBsiOSC,
y = RNAseq_YBsiOSC)
limitsX=c(1e-5,1e1)
limitsY=c(1e-7,1e-2)
}
if (i == "SOMA_RNAseq") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = RNAseq_wtOSC,
y = RNAseq_YBsiOSC)
limitsX=c(1e-7,1e-2)
limitsY=c(1e-7,1e-2)
}
if (i == "gl_wt") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_Wsh_avg,
y = RNAseq_Wsh)
limitsX=c(1e-5,1e1)
limitsY=c(1e-7,1e-2)
}
if (i == "gl_wt_old") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_Wsh_avg,
y = RNAseq_old_Wsh)
limitsX=c(1e-5,1e1)
limitsY=c(1e-7,1e-2)
}
if (i == "gl_aubAgo3") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_AubAGO3sh_avg,
y = RNAseq_AubAGO3sh)
limitsX=c(1e-5,1e1)
limitsY=c(1e-8,1e-3)
}
if (i == "gl_aubAgo3_GeTMM") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_AubAGO3sh_avg,
y = RNA_AubAGO3sh)
limitsX=c(1e-5,1e1)
limitsY=c(1e-8,1e-3)
}
if (i == "gl_aubAgo3_old") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_AubAGO3sh_avg,
y = RNAseq_old_AubAGO3sh)
limitsX=c(1e-5,1e1)
limitsY=c(1e-8,1e-3)
}
if (i == "Wsh_vs_AubAGO3") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = sRNA_Wsh_avg,
y = sRNA_AubAGO3sh_avg)
limitsX=c(1e-5,1e1)
limitsY=c(1e-7,1e1)
}
if (i == "GL_RNAseq") {
DATA <- plotTABLE %>%
select(FBgn, FBgn2, TYPE,TISSUE,
x = RNAseq_AubAGO3sh,
y = RNA_AubAGO3sh)
limitsX=c(1e-7,1e-2)
limitsY=c(1e-7,1e-2)
}
# DATA_med <- DATA %>%
# group_by(FBgn) %>%
# mutate(
# y = median(y, na.rm = TRUE) # or store as new column: y_median = median(y)
# ) %>%
# ungroup()
# 2) Filter for log scale and add log variables
DATA_log <- DATA %>%
group_by(FBgn2) %>%
mutate(
y = mean(y, na.rm = TRUE),
)%>%
filter(x > 0, y > 0, !grepl("CLUSTER",TYPE), TISSUE != "SOMA") %>%
mutate(
logx = log10(x),
logy = log10(y),
TYPE = case_when(
TYPE == "glUTR" ~ "UTR",
TRUE ~ TYPE
),
TYPE="ALL"
)%>%
# filter(x > 0, logy >= 10e-5, !grepl("CLUSTER",TYPE)) %>%
{}
# 3) Statistics per TYPE in log space
cor_stats <- DATA_log %>%
group_by(TYPE) %>%
summarise(
spearman_log = cor(logx, logy, method = "spearman"),
pearson_log = cor(logx, logy, method = "pearson"),
medianX = median(x)*1000,
.groups = "drop"
)
lm_stats <- DATA_log %>%
group_by(TYPE) %>%
do({
fit <- lm(logy ~ logx, data = .)
tibble(
slope = coef(fit)[["logx"]],
intercept = coef(fit)[["(Intercept)"]],
r2_log = summary(fit)$r.squared,
p_slope = summary(fit)$coefficients["logx", "Pr(>|t|)"]
)
}) %>%
ungroup()
pos_stats <- DATA_log %>%
group_by(TYPE) %>%
summarise(
x_pos = 1e-5,
y_pos = 1e-2,
.groups = "drop"
)
stats <- cor_stats %>%
left_join(lm_stats, by = c("TYPE")) %>%
left_join(pos_stats, by = c("TYPE")) %>%
mutate(
label = sprintf("ρ_Spearman(log) = %.2f\nβ = %.2f, R²(log) = %.2f\nMedian_x = %.2f",
spearman_log, slope, r2_log, medianX)
)
correlation <- lm( DATA_log$logy ~ DATA_log$logx) %>%
summary() %>% .$r.squared
CCC_value <- CCC(DATA_log$logx , DATA_log$logy)
SPEAR <- SpearmanRho(DATA_log$logx , DATA_log$logy)
# 4) Plot
p <- ggplot(DATA_log, aes(x = x, y = y, label=FBgn)) +
geom_point_rast(alpha = 0.3, size = 0.8, shape=16) +
# geom_point(data=. %>% filter(grepl(c("FBgn0085400"), FBgn)), color="cyan", size=1.5)+
geom_smooth(method = "lm", se = FALSE, color = "red") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "blue") +
# facet_row( ~ TYPE) +
annotation_logticks(sides = "lb", outside = FALSE) +
geom_text(
data = stats,
aes(x = 1e-2, y = 1e1, label = label),
inherit.aes = FALSE,
hjust = 1, vjust = 1, size = 3
) +
scale_x_log10(
name = "sRNA (log10)",
# limits=limitsX,
labels = scales::trans_format("log10", scales::math_format(10^.x))
) +
scale_y_log10(
name = "RNA-seq (log10)",
# limits=limitsY,
labels = scales::trans_format("log10", scales::math_format(10^.x))
) +
labs(title = i) +
theme_bw(base_size = 10) +
theme(
strip.background = element_rect(fill = "grey90", color = NA),
strip.text = element_text(face = "bold"),
aspect.ratio=1,
panel.grid.minor = element_blank()
)
print(p)
ggplotly(p, tooltip = c("label", "x", "y"))
ggsave(p, filename = paste0("correlation_sRNA_RNAseq_", i, ".pdf"), width = 8, height = 8, dpi = 300)
}
############################################################################################### ############################################################################################### ############################################################################################### # ########################### deregulated TE analysis ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
transcriptQUANT = read_tsv("transcript_quantification.txt")%>%
#rename columns
rename(
"Wsh_sense" = `average-Wsh_nGFP-Piwi-PiwiIP_new_sense` ,
"Wsh_antisense" = `average-Wsh_nGFP-Piwi-PiwiIP_new_antisense` ,
"AubAGO3sh_sense" = `average-AUBshAGO3sh_nGFP-Piwi-PiwiIP_new_sense`,
"AubAGO3sh_antisense" = `average-AUBshAGO3sh_nGFP-Piwi-PiwiIP_new_antisense`
)%>%
mutate(
SR_Wsh = Wsh_sense / (Wsh_antisense ),
SR_AubAGO3sh = AubAGO3sh_sense / (AubAGO3sh_antisense ),
)
## Rows: 11159 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): ID
## dbl (8): average-AUBshAGO3sh_nGFP-Piwi-PiwiIP_new_sense, average-AUBshAGO3sh...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
{}
## NULL
deregTEs = read_tsv("GL-deregulatedTE.txt")%>%
mutate(STATUS="DEREGULATED")
## Rows: 64 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): ID
## dbl (3): AubAGO3_RNA, Wsh_RNA, DEREG
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plotTABLE = transcriptQUANT %>%
filter(! grepl("FBgn",ID)) %>%
left_join(deregTEs, by=c("ID"="ID")) %>%
mutate(
STATUS = case_when(
is.na(STATUS) ~ "UNCHANGED",
TRUE ~ STATUS
)
) %>%
filter(
DEREG >10
)
{}
## NULL
p = plotTABLE %>%
select(ID,STATUS,SR_Wsh,SR_AubAGO3sh, DEREG)%>%
pivot_longer(cols=c(SR_Wsh,SR_AubAGO3sh), names_to="SAMPLE", values_to="VALUE")%>%
ggplot(aes(x=SAMPLE, y=VALUE, label=paste(ID, VALUE, sep="_")))+
geom_quasirandom()+
facet_row(~STATUS)+
theme_cowplot(14)+
scale_y_log10()+
annotation_logticks(sides = "l", outside=FALSE) +
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)
ggplotly(p, tooltip = c("label", "x", "y"))
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLogticks() has yet to be implemented in plotly.
## If you'd like to see this geom implemented,
## Please open an issue with your example code at
## https://github.com/ropensci/plotly/issues
print(p)
ggsave(p, filename = "deregulatedTEs_Strand-ratio.pdf", width = 3.5, height = 4, dpi = 300)
p = plotTABLE %>%
select(ID,STATUS,SR_Wsh,SR_AubAGO3sh,DEREG,AubAGO3_RNA)%>%
pivot_longer(cols=c(SR_Wsh,SR_AubAGO3sh), names_to="SAMPLE", values_to="VALUE")%>%
ggplot(aes(x=SAMPLE, y=VALUE, color=log2(AubAGO3_RNA)))+
geom_point(aes( group=ID))+
geom_line(aes( group=ID), alpha=0.3)+
facet_row(~STATUS)+
theme_cowplot(14)+
scale_y_log10()+
annotation_logticks(sides = "l", outside=FALSE) +
stat_summary(
fun = median,
geom = "crossbar",
width = 0.5,
fatten = 1.5,
color = "red"
) +
stat_summary(
fun = median,
geom = "text",
aes(label = sprintf("%.2f", after_stat(y))),
vjust = -0.5,
size = 3,
color = "red"
)
plotTABLE %>%
pivot_longer(cols=c(SR_Wsh,SR_AubAGO3sh), names_to="SAMPLE", values_to="VALUE")%>%
ggplot(aes(x=DEREG, y=VALUE))+
geom_point()+
theme_cowplot(14)+
facet_row(~SAMPLE)+
scale_y_log10()+
annotation_logticks(sides = "l", outside=FALSE) +
geom_hline(yintercept = 1, color="red")
RAW = read_tsv("TEhist.txt")%>%
rename(
"Wsh_sense" = `average-Wsh_nGFP-Piwi-PiwiIP_new_sense` ,
"Wsh_antisense" = `average-Wsh_nGFP-Piwi-PiwiIP_new_antisense` ,
"AubAGO3sh_sense" = `average-AUBshAGO3sh_nGFP-Piwi-PiwiIP_new_sense`,
"AubAGO3sh_antisense" = `average-AUBshAGO3sh_nGFP-Piwi-PiwiIP_new_antisense`
)%>%
{}
# for(TE in deregTEs$ID){
TE="blood"
p = RAW %>%
filter(ID == TE)%>%
pivot_longer(
-c(ID,POS),
names_to = "SAMPLE",
values_to = "VALUE"
)%>%
mutate(
LIB = case_when(
grepl("Wsh_", SAMPLE) ~ "Wsh",
grepl("AubAGO3sh_", SAMPLE) ~ "AubAGO3sh"
),
VALUE = case_when(
VALUE < -500 ~ -550,
VALUE > 500 ~ 550,
TRUE ~ VALUE
)
)%>%
ggplot(aes(x=POS))+
geom_line(data=. %>% filter(grepl("_sense",SAMPLE)), aes(y=VALUE))+
geom_line(data=. %>% filter(grepl("_antisense",SAMPLE)), aes(y=VALUE))+
theme_cowplot(14)+
facet_col(~LIB)+
coord_cartesian(ylim=c(-500,500))+
geom_hline(yintercept = 1, color="grey")+
labs(title=TE, y="sRNA abundance", x="Position on TE")+
theme(legend.position = "none")
print(p)
NAME=paste("TEhist/",TE,".pdf",sep="")
ggsave(p, filename = paste0("TEhist/",TE,".pdf"), width = 8, height = 4, dpi = 300)
# }
############################################################################################### ############################################################################################### ############################################################################################### # ########################### GL stacked bar chart ############################################### ############################################################################################### ############################################################################################### ###############################################################################################
# read in the data
RAW = read_tsv("quantified-sources_new2.txt") %>%
filter(ANN != "miRNA")
## Rows: 423 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): GENO, TYPE, ANN, CLUSTER, SOURCE, TEtype
## dbl (1): COUNT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#print table to give overview
RAW
## # A tibble: 399 × 7
## GENO TYPE ANN COUNT CLUSTER SOURCE TEtype
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 shPiwi_PiwiIP_Kirsten sRNA NONE 0 N <NA> <NA>
## 2 shControl_PiwiIP_Kirsten sRNA NONE 0 N <NA> <NA>
## 3 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA 3UTR 5.23e5 N N <NA>
## 4 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA 5UTR 2.90e4 N N <NA>
## 5 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA CDS 7.65e5 N N <NA>
## 6 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA INTRON 4.61e4 N N <NA>
## 7 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA INTRON 6.49e1 Y N <NA>
## 8 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA NONE 2.17e4 Y N <NA>
## 9 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA NONE 3.33e5 N N <NA>
## 10 AUBshAGO3sh_nGFP-Piwi-PiwiIP_new sRNA TE:!:act… 3.34e3 Y N active
## # ℹ 389 more rows
TABLE = RAW %>%
separate(ANN, into=c("ANN","DETAIL"), sep=":", extra="merge") %>%
filter(GENO %in% c("wsh_GLKD_MAbAgo3-IP_ov", "wsh_GLKD_MAbAub-IP_ov", "Wsh_nGFP-Piwi-PiwiIP_old"),
ANN %in% c("CDS","3UTR","TE", "TE_AS"))%>%
group_by(GENO, ANN) %>%
summarise(
sumCOUNT = sum(COUNT)
) %>%
ungroup() %>%
# mutate(
# sumCOUNT = case_when(
# GENO == "wsh_GLKD_MAbAgo3-IP_ov" ~ sumCOUNT / 33.58,
# GENO == "wsh_GLKD_MAbAub-IP_ov" ~ sumCOUNT / 8.9,
# GENO == "Wsh_nGFP-Piwi-PiwiIP" ~ sumCOUNT
# )
# )%>%
{}
plotTABLE = TABLE %>%
group_by(ANN) %>%
mutate(
PERCENT = sumCOUNT * 100 / sum(sumCOUNT)
# PERCENT = sumCOUNT
) %>%
ungroup()
plotTABLE %>%
ggplot(aes(x = ANN, y = PERCENT, fill = GENO)) +
geom_col(position = "stack") +
# facet_wrap(~ SET, scales = "free_x") +
labs(
x = "Feature Type",
y = "Percentage (%)",
fill = "Sample",
title = "Sample Contribution to Each Feature Type"
) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right"
)
df = RAW %>%
filter(GENO %in% c("wsh_GLKD_MAbAgo3-IP_ov", "wsh_GLKD_MAbAub-IP_ov", "Wsh_nGFP-Piwi-PiwiIP_old")) %>%
select(GENO,TYPE,ANN, COUNT)%>%
separate(ANN, into=c("ANN","DETAIL"), sep=":!:", extra="merge") %>%
mutate(
ANN = case_when(
ANN %in% c("CDS","5UTR","3UTR") ~ "GENIC",
TRUE ~ ANN
)
)%>%
group_by(GENO,ANN) %>%
summarise(
TOTAL = sum(COUNT)
)%>%
mutate(
TOTAL = case_when(
ANN == "INTRON" ~ 0,
# ANN == "TE_AS" ~ 0,
TRUE ~ TOTAL
)
)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 26 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 35, 36, ...].
## `summarise()` has grouped output by 'GENO'. You can override using the
## `.groups` argument.
df$ANN = factor(df$ANN, levels = c("GENIC","NONE","TE", "TE_AS" ))
p = ggplot(df, aes(y = TOTAL, axis1 = ANN, axis2 = GENO)) +
geom_alluvium(aes(fill = ANN), width = 1/12) +
geom_stratum(width = 1/4, aes(fill=ANN)) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
scale_x_discrete(limits = c("Annotation", "Genotype"), expand = c(.05, .05)) +
theme_minimal() +
labs(title = "Distribution across Genotypes",
y = "") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid = element_blank())
print(p)
ggsave(paste("stackedBars_GL_PIWI-IPs.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)
p = df %>%
ggplot(aes(x=ANN, y=TOTAL))+
geom_bar(aes(fill=GENO), stat="identity", position="fill")
print(p)
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).
ggsave("stackedBar_GL_PIWI-IPs_v2.pdf", p, width = 5, height = 5, units = "in", dpi = 300)
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).
PLOTtable = RAW %>%
filter(GENO %in% c("WT", " ", "Wsh_nGFP-Piwi-PiwiIP_old")) %>%
separate(ANN, c("ANN","TEtype"), sep = ":!:", remove = TRUE)%>%
mutate(
COUNT = case_when(
ANN == "INTRON" ~ 0,
ANN == "TE_AS" ~ 0,
ANN == "TE" ~ 0,
ANN == "NONE" ~ 0,
TRUE ~ COUNT
)
)%>%
group_by(GENO, TYPE,ANN) %>%
summarise(COUNT = sum(COUNT)) %>%
mutate(
COUNT = COUNT / sum(COUNT)
)
PLOTtable$ANN = factor(PLOTtable$ANN, levels = c("INTRON" ,"5UTR","CDS", "3UTR","NONE","TE", "TE_AS" ))
p = PLOTtable %>%
ggplot(aes(x=GENO, y=COUNT, alluvium=ANN, fill=ANN))+
scale_x_discrete(expand = c(.2, .05)) +
geom_bar(stat = "identity", width = 0.2, color = "white", size = 0.2) +
geom_alluvium(width = 0.2, alpha = 0.7,
curve_type = "sigmoid")+ # smoother curves
scale_fill_scico_d(palette = "navia", direction = -1) +
scale_y_continuous(labels = scales::percent_format()) +
theme_cowplot(12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "right",
axis.text.x = element_text(angle = 45, hjust = 1) # if labels are long
) +
labs(x = NULL, y = "Proportion", fill = "Annotation")
print(p)
ggsave(paste("stackedBars_OSC_vs_GL-Piwi.pdf",sep=""), p, width = 3, height = 5, units = "in", dpi = 300)